diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..4077f91 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,7 @@ +version: 2 +jobs: + build: + docker: + - image: circleci/python:3.7-node-browsers + steps: + - run: echo "hello world" diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..7330426 --- /dev/null +++ b/.clang-format @@ -0,0 +1,88 @@ +--- +AccessModifierOffset: -1 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] +IncludeCategories: + - Regex: '^<.*\.h(pp)?>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 2000000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..5466a4a --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,43 @@ +--- +# NOTE: there must be no spaces before the '-', so put the comma first. 
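+# The leading '*' in the list below enables every available check; the
+# '-'-prefixed globs then opt back out of individual checks or whole groups.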
+Checks: ' + * + ,modernize-* + ,-cert-err58-cpp + ,-cert-err60-cpp + ,-clang-diagnostic-* + ,-cppcoreguidelines-owning-memory + ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay + ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-static-cast-downcast + ,-cppcoreguidelines-pro-type-vararg + ,-cppcoreguidelines-special-member-functions + ,-fuchsia-* + ,-google-build-using-namespace + ,-google-explicit-constructor + ,-google-readability-braces-around-statements + ,-google-readability-namespace-comments + ,-google-readability-todo + ,-google-runtime-references + ,-google-runtime-references + ,-hicpp-braces-around-statements + ,-hicpp-explicit-conversions + ,-hicpp-no-array-decay + ,-hicpp-special-member-functions + ,-hicpp-vararg + ,-llvm-header-guard + ,-llvm-namespace-comment + ,-misc-unused-parameters + ,-modernize-make-unique + ,-modernize-use-default-member-init + ,-performance-unnecessary-value-param + ,-readability-braces-around-statements + ,-readability-else-after-return + ,-readability-named-parameter + ,clang-analyzer-* + ' +WarningsAsErrors: '' +HeaderFilterRegex: 'torch/csrc/' +AnalyzeTemporaryDtors: false +CheckOptions: +... diff --git a/.dockerignore b/.dockerignore new file mode 120000 index 0000000..3e4e48b --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.gitignore \ No newline at end of file diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..407cab0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,38 @@ +If you have a question or would like help and support, please ask at our +[forums](https://discuss.pytorch.org/). + +If you are submitting a feature request, please preface the title with [feature request]. +If you are submitting a bug report, please fill in the following details. + +## Issue description + +Provide a short description. + +## Code example + +Please try to provide a minimal example to repro the bug. +Error messages and stack traces are also helpful. + +## System Info +Please copy and paste the output from our +[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) +(or fill out the checklist below manually). + +You can get the script and run it with: +``` +wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py +# For security purposes, please check the contents of collect_env.py before running it. +python collect_env.py +``` + +- PyTorch or Caffe2: +- How you installed PyTorch (conda, pip, source): +- Build command you used (if compiling from source): +- OS: +- PyTorch version: +- Python version: +- CUDA/cuDNN version: +- GPU models and configuration: +- GCC version (if compiling from source): +- CMake version: +- Versions of any other relevant libraries: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09e5ed8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,196 @@ +# READ THIS BEFORE YOU REFACTOR ME +# +# setup.py uses the list of patterns in this file to decide +# what to delete, but it's not 100% sound. So, for example, +# if you delete aten/build/ because it's redundant with build/, +# aten/build/ will stop being cleaned. So be careful when +# refactoring this file! 
+ +## PyTorch + +.mypy_cache +*/*.pyc +*/*.so* +*/**/__pycache__ +*/**/*.dylib* +*/**/*.pyc +*/**/*.pyd +*/**/*.so* +*/**/**/*.pyc +*/**/**/**/*.pyc +*/**/**/**/**/*.pyc +aten/build/ +aten/src/ATen/Config.h +aten/src/ATen/cuda/CUDAConfig.h +build/ +dist/ +docs/src/**/* +test/.coverage +test/cpp/api/mnist +test/data/gpu_tensors.pt +test/data/legacy_modules.t7 +test/data/legacy_serialized.pt +test/data/linear.pt +test/htmlcov +third_party/build/ +tools/shared/_utils_internal.py +torch.egg-info/ +torch/csrc/autograd/generated/* +torch/csrc/cudnn/cuDNN.cpp +torch/csrc/generated +torch/csrc/generic/TensorMethods.cpp +torch/csrc/jit/generated/* +torch/csrc/nn/THCUNN.cpp +torch/csrc/nn/THCUNN.cwrap +torch/csrc/nn/THNN_generic.cpp +torch/csrc/nn/THNN_generic.cwrap +torch/csrc/nn/THNN_generic.h +torch/csrc/nn/THNN.cpp +torch/csrc/nn/THNN.cwrap +torch/lib/*.a* +torch/lib/*.dll* +torch/lib/*.dylib* +torch/lib/*.h +torch/lib/*.lib +torch/lib/*.so* +torch/lib/build +torch/lib/cmake +torch/lib/include +torch/lib/pkgconfig +torch/lib/protoc +torch/lib/tmp_install +torch/lib/torch_shm_manager +torch/version.py + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swn +*.swo +*.swp +*.swm +*~ + +# macOS dir files +.DS_Store + +# Symbolic files +tools/shared/cwrap_common.py + +# Ninja files +.ninja_deps +.ninja_log +compile_commands.json +*.egg-info/ +docs/source/scripts/activation_images/ + +## General + +# Compiled Object files +*.slo +*.lo +*.o +*.cuo +*.obj + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Compiled protocol buffers +*.pb.h +*.pb.cc +*_pb2.py + +# Compiled python +*.pyc +*.pyd + +# Compiled MATLAB +*.mex* + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swn +*.swo +*.swp +*~ + +# Sublime Text settings +*.sublime-workspace +*.sublime-project + +# Eclipse Project settings +*.*project +.settings + +# QtCreator files +*.user + +# PyCharm files +.idea + +# Visual Studio Code files +.vscode +.vs + +# OSX dir files +.DS_Store + +## Caffe2 + +# build, distribute, and bins (+ python proto bindings) +build +build_host_protoc +build_android +build_ios +/build_* +.build_debug/* +.build_release/* +distribute/* +*.testbin +*.bin +cmake_build +.cmake_build +gen +.setuptools-cmake-build +.pytest_cache +aten/build/* + +# Bram +plsdontbreak + +# Generated documentation +docs/_site +docs/gathered +_site +doxygen +docs/dev + +# LevelDB files +*.sst +*.ldb +LOCK +LOG* +CURRENT +MANIFEST-* + +# generated version file +caffe2/version.py + +# setup.py intermediates +.eggs +caffe2.egg-info + +# Atom/Watchman required file +.watchmanconfig diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..098255c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,81 @@ +[submodule "third_party/catch"] + path = third_party/catch + url = https://github.com/catchorg/Catch2.git +[submodule "third_party/nanopb"] + path = third_party/nanopb + url = https://github.com/nanopb/nanopb.git +[submodule "third_party/pybind11"] + path = third_party/pybind11 + url = https://github.com/pybind/pybind11.git +[submodule "third_party/cub"] + path = third_party/cub + url = https://github.com/NVlabs/cub.git +[submodule "third_party/eigen"] + path = third_party/eigen + url = https://github.com/eigenteam/eigen-git-mirror.git +[submodule "third_party/googletest"] + path = third_party/googletest + url = https://github.com/google/googletest.git +[submodule "third_party/nervanagpu"] + path = third_party/nervanagpu + 
url = https://github.com/NervanaSystems/nervanagpu.git +[submodule "third_party/benchmark"] + path = third_party/benchmark + url = https://github.com/google/benchmark.git +[submodule "third_party/protobuf"] + path = third_party/protobuf + url = https://github.com/google/protobuf.git +[submodule "third_party/ios-cmake"] + path = third_party/ios-cmake + url = https://github.com/Yangqing/ios-cmake.git +[submodule "third_party/NNPACK"] + path = third_party/NNPACK + url = https://github.com/Maratyszcza/NNPACK.git +[submodule "third_party/gloo"] + path = third_party/gloo + url = https://github.com/facebookincubator/gloo +[submodule "third_party/NNPACK_deps/pthreadpool"] + path = third_party/pthreadpool + url = https://github.com/Maratyszcza/pthreadpool.git +[submodule "third_party/NNPACK_deps/FXdiv"] + path = third_party/FXdiv + url = https://github.com/Maratyszcza/FXdiv.git +[submodule "third_party/NNPACK_deps/FP16"] + path = third_party/FP16 + url = https://github.com/Maratyszcza/FP16.git +[submodule "third_party/NNPACK_deps/psimd"] + path = third_party/psimd + url = https://github.com/Maratyszcza/psimd.git +[submodule "third_party/zstd"] + path = third_party/zstd + url = https://github.com/facebook/zstd.git +[submodule "third-party/cpuinfo"] + path = third_party/cpuinfo + url = https://github.com/Maratyszcza/cpuinfo.git +[submodule "third_party/python-enum"] + path = third_party/python-enum + url = https://github.com/PeachPy/enum34.git +[submodule "third_party/python-peachpy"] + path = third_party/python-peachpy + url = https://github.com/Maratyszcza/PeachPy.git +[submodule "third_party/python-six"] + path = third_party/python-six + url = https://github.com/benjaminp/six.git +[submodule "third_party/ComputeLibrary"] + path = third_party/ComputeLibrary + url = https://github.com/ARM-software/ComputeLibrary.git +[submodule "third_party/onnx"] + path = third_party/onnx + url = https://github.com/onnx/onnx.git +[submodule "third_party/cereal"] + path = third_party/cereal + url = https://github.com/USCiLab/cereal +[submodule "third_party/onnx-tensorrt"] + path = third_party/onnx-tensorrt + url = https://github.com/onnx/onnx-tensorrt +[submodule "third_party/sleef"] + path = third_party/sleef + url = https://github.com/shibatch/sleef +[submodule "third_party/ideep"] + path = third_party/ideep + url = https://github.com/intel/ideep diff --git a/.jenkins/caffe2/README.md b/.jenkins/caffe2/README.md new file mode 100644 index 0000000..c22cd8f --- /dev/null +++ b/.jenkins/caffe2/README.md @@ -0,0 +1,14 @@ +# Jenkins + +The scripts in this directory are the entrypoint for testing Caffe2. + +The environment variable `BUILD_ENVIRONMENT` is expected to be set to +the build environment you intend to test. It is a hint for the build +and test scripts to configure Caffe2 a certain way and include/exclude +tests. Docker images, they equal the name of the image itself. For +example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are +built on Jenkins and are used in triggered builds already have this +environment variable set in their manifest. Also see +`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. + +Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 
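+
+As a purely illustrative sketch (not copied from any script in this
+directory), build and test scripts typically branch on `BUILD_ENVIRONMENT`
+with shell pattern matches; the branch bodies below are placeholders:
+
+```bash
+# Hypothetical example -- the real logic lives in build.sh and test.sh.
+if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
+  echo "configure a CUDA build"
+elif [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
+  echo "configure an Android cross-build"
+else
+  echo "configure a default CPU build"
+fi
+```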
diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh new file mode 100755 index 0000000..345e89c --- /dev/null +++ b/.jenkins/caffe2/build.sh @@ -0,0 +1,273 @@ +#!/bin/bash + +set -ex + +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) +CMAKE_ARGS=() + + +# Setup SCCACHE +############################################################################### +# Setup sccache if SCCACHE_BUCKET is set +if [ -n "${SCCACHE_BUCKET}" ]; then + mkdir -p ./sccache + + SCCACHE="$(which sccache)" + if [ -z "${SCCACHE}" ]; then + echo "Unable to find sccache..." + exit 1 + fi + + # Setup wrapper scripts + for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do + ( + echo "#!/bin/sh" + echo "exec $SCCACHE $(which $compiler) \"\$@\"" + ) > "./sccache/$compiler" + chmod +x "./sccache/$compiler" + done + + if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then + ( + echo "#!/bin/sh" + echo "exec $SCCACHE $(which nvcc) \"\$@\"" + ) > "./sccache/nvcc" + chmod +x "./sccache/nvcc" + fi + + export CACHE_WRAPPER_DIR="$PWD/sccache" + + # CMake must find these wrapper scripts + export PATH="$CACHE_WRAPPER_DIR:$PATH" +fi + +# Setup ccache if configured to use it (and not sccache) +if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then + mkdir -p ./ccache + ln -sf "$(which ccache)" ./ccache/cc + ln -sf "$(which ccache)" ./ccache/c++ + ln -sf "$(which ccache)" ./ccache/gcc + ln -sf "$(which ccache)" ./ccache/g++ + ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc + if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then + ln -sf "$(which ccache)" ./ccache/nvcc + fi + export CACHE_WRAPPER_DIR="$PWD/ccache" + export PATH="$CACHE_WRAPPER_DIR:$PATH" +fi + +report_compile_cache_stats() { + if [[ -n "${SCCACHE}" ]]; then + "$SCCACHE" --show-stats + elif which ccache > /dev/null; then + ccache -s + fi +} + +############################################################################### +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. +PYTHON="$(which python)" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON=$(which "python${BASH_REMATCH[1]}") + CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") +fi + + +############################################################################### +# Use special scripts for Android, conda, and setup builds +############################################################################### +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + export ANDROID_NDK=/opt/ndk + CMAKE_ARGS+=("-DBUILD_BINARY=ON") + CMAKE_ARGS+=("-DBUILD_TEST=ON") + CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") + CMAKE_ARGS+=("-DUSE_ZSTD=ON") + "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" + exit 0 +elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then + "${ROOT_DIR}/scripts/build_anaconda.sh" --skip-tests --install-locally "$@" + report_compile_cache_stats + + # This build will be tested against onnx tests, which needs onnx installed. 
+ # At this point the visible protbuf installation will be in conda, since one + # of Caffe2's dependencies uses conda, so the correct protobuf include + # headers are those in conda as well + # This path comes from install_anaconda.sh which installs Anaconda into the + # docker image + PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" + report_compile_cache_stats + exit 0 +elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then + rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX + PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX + exit 0 +fi + + +############################################################################### +# Set cmake args +############################################################################### +CMAKE_ARGS+=("-DBUILD_BINARY=ON") +CMAKE_ARGS+=("-DBUILD_TEST=ON") +CMAKE_ARGS+=("-DINSTALL_TEST=ON") +CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") +CMAKE_ARGS+=("-DUSE_ZSTD=ON") +CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") + +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then + if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then + CMAKE_ARGS+=("-DBUILD_ATEN=ON") + fi +fi +if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then + CMAKE_ARGS+=("-DBLAS=MKL") +fi +if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then + CMAKE_ARGS+=("-DUSE_CUDA=ON") + CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") + CMAKE_ARGS+=("-DUSE_NNPACK=OFF") + + # Explicitly set path to NVCC such that the symlink to ccache or sccache is used + CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") + + # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. + # Setting PATH to resolve to the right nvcc alone isn't enough. + # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. + export CUDA_PATH="/usr/local/cuda" + + # Ensure the ccache symlink can still find the real nvcc binary. + export PATH="/usr/local/cuda/bin:$PATH" +fi +if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + # TODO: This is patching the official FindHip to properly handly + # cmake generator expression. A PR is opened in the upstream repo here: + # https://github.com/ROCm-Developer-Tools/HIP/pull/516 + # remove this hack once it's merged. + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi + + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + export HCC_AMDGPU_TARGET=gfx900 +fi + +# Try to include Redis support for Linux builds +if [ "$(uname)" == "Linux" ]; then + CMAKE_ARGS+=("-DUSE_REDIS=ON") +fi + +# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS +# contbuild at the moment is minimal dependency - it doesn't use glog +# or gflags either. +if [ "$(uname)" == "Darwin" ]; then + CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") +fi + +# Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace +CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") + +if [[ -n "$INTEGRATED" ]]; then + # TODO: This is a temporary hack to work around the issue that both + # caffe2 and pytorch have libcaffe2.so and crossfire at runtime. + CMAKE_ARGS+=("-DBUILD_SHARED_LIBS=OFF") + CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") + CMAKE_ARGS+=("-DCAFFE2_LINK_LOCAL_PROTOBUF=OFF") +fi + +# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) +# and use that if so. 
+if [[ -x "$(command -v cmake3)" ]]; then + CMAKE_BINARY=cmake3 +else + CMAKE_BINARY=cmake +fi +# sccache will fail for CUDA builds if all cores are used for compiling +if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then + MAX_JOBS=`expr $(nproc) - 1` +else + MAX_JOBS=$(nproc) +fi + + +############################################################################### +# Configure and make +############################################################################### +# Run cmake from ./build_caffe2 directory so it doesn't conflict with +# standard PyTorch build directory. Eventually these won't need to +# be separate. +rm -rf build_caffe2 +mkdir build_caffe2 +cd ./build_caffe2 + +# Configure +${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" + +# Build +if [ "$(uname)" == "Linux" ]; then + make "-j${MAX_JOBS}" install +else + echo "Don't know how to build on $(uname)" + exit 1 +fi + +report_compile_cache_stats + + +############################################################################### +# Install ONNX +############################################################################### + +# Install ONNX into a local directory +pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" + +report_compile_cache_stats + +if [[ -n "$INTEGRATED" ]]; then + # sccache will be stuck if all cores are used for compiling + # see https://github.com/pytorch/pytorch/pull/7361 + if [[ -n "${SCCACHE}" ]]; then + export MAX_JOBS=`expr $(nproc) - 1` + fi + pip install --user -v -b /tmp/pip_install_torch "file://${ROOT_DIR}#egg=torch" +fi + +report_compile_cache_stats + +# Symlink the caffe2 base python path into the system python path, +# so that we can import caffe2 without having to change $PYTHONPATH. +# Run in a subshell to contain environment set by /etc/os-release. +# +# This is only done when running on Jenkins! We don't want to pollute +# the user environment with Python symlinks and ld.so.conf.d hacks. +# +if [ -n "${JENKINS_URL}" ]; then + ( + source /etc/os-release + + function python_version() { + "$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])' + } + + # Debian/Ubuntu + if [[ "$ID_LIKE" == *debian* ]]; then + python_path="/usr/local/lib/$(python_version)/dist-packages" + sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" + fi + + # RHEL/CentOS + if [[ "$ID_LIKE" == *rhel* ]]; then + python_path="/usr/lib64/$(python_version)/site-packages/" + sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" + fi + + # /etc/ld.so.conf.d is used on both Debian and RHEL + echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf + sudo ldconfig + ) +fi diff --git a/.jenkins/caffe2/dirty.sh b/.jenkins/caffe2/dirty.sh new file mode 100755 index 0000000..6b9ba54 --- /dev/null +++ b/.jenkins/caffe2/dirty.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -ex +upstream="$1" +pr="$2" +git diff --name-only "$upstream" "$pr" +# For safety, unconditionally trigger for any changes. +#git diff --name-only "$upstream" "$pr" | grep -Eq '^(CMakeLists.txt|Makefile|.gitmodules|.jenkins/caffe2|binaries|caffe|caffe2|cmake|conda|docker|docs/caffe2|modules|scripts|third_party)' diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh new file mode 100755 index 0000000..a4bb748 --- /dev/null +++ b/.jenkins/caffe2/test.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../.. 
&& pwd) +TEST_DIR=$ROOT_DIR/caffe2_tests + +# Figure out which Python to use +PYTHON="python" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON="python${BASH_REMATCH[1]}" +fi + +# The prefix must mirror the setting from build.sh +INSTALL_PREFIX="/usr/local/caffe2" + +# Anaconda builds have a special install prefix and python +if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then + # This path comes from install_anaconda.sh which installs Anaconda into the + # docker image + PYTHON="/opt/conda/bin/python" + INSTALL_PREFIX="/opt/conda/" +fi + +# Add the site-packages in the caffe2 install prefix to the PYTHONPATH +SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))") +INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}" + +# Skip tests in environments where they are not built/applicable +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + echo 'Skipping tests' + exit 0 +fi + +# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed +# Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this. +if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then + export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR" + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib" +fi + +cd "$ROOT_DIR" + +if [ -d $TEST_DIR ]; then + echo "Directory $TEST_DIR already exists; please remove it..." + exit 1 +fi + +mkdir -p $TEST_DIR/{cpp,python} + +cd ${INSTALL_PREFIX} + +# C++ tests +echo "Running C++ tests.." +gtest_reports_dir="${TEST_DIR}/cpp" +junit_reports_dir="${TEST_DIR}/junit_reports" +mkdir -p "$gtest_reports_dir" "$junit_reports_dir" +for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do + case "$test" in + # skip tests we know are hanging or bad + */mkl_utils_test|*/aten/integer_divider_test) + continue + ;; + */aten/*) + # ATen uses test framework Catch2 + "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + ;; + *) + "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" + ;; + esac +done + +# Get the relative path to where the caffe2 python module was installed +CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2" + +# Collect additional tests to run (outside caffe2/python) +EXTRA_TESTS=() + +# CUDA builds always include NCCL support +if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then + EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl") +fi + +conda_ignore_test=() +if [[ $BUILD_ENVIRONMENT == conda* ]]; then + # These tests both assume Caffe2 was built with leveldb, which is not the case + conda_ignore_test+=("--ignore $CAFFE2_PYPATH/python/dataio_test.py") + conda_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/checkpoint_test.py") +fi + + +# TODO: re-enable this for rocm CI jobs once we have more rocm workers +if [[ $BUILD_ENVIRONMENT != *rocm* ]]; then + # Python tests + echo "Running Python tests.." 
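+  # pytest flags: -x stops at the first failing test, -v is verbose, and
+  # --junit-xml writes a report that Jenkins can collect.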
+ "$PYTHON" \ + -m pytest \ + -x \ + -v \ + --junit-xml="$TEST_DIR/python/result.xml" \ + --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \ + --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \ + ${conda_ignore_test[@]} \ + "$CAFFE2_PYPATH/python" \ + "${EXTRA_TESTS[@]}" +fi + +if [[ -n "$INTEGRATED" ]]; then + pip install --user pytest-xdist torchvision + "$ROOT_DIR/scripts/onnx/test.sh" -p +fi diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md new file mode 100644 index 0000000..ea6c6dd --- /dev/null +++ b/.jenkins/pytorch/README.md @@ -0,0 +1,42 @@ +This directory contains scripts for our continuous integration. + +One important thing to keep in mind when reading the scripts here is +that they are all based off of Docker images, which we build for each of +the various system configurations we want to run on Jenkins. This means +it is very easy to run these tests yourself: + +1. Figure out what Docker image you want. The general template for our + images look like: + ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``, + where ``$BUILD_ENVIRONMENT`` is one of the build environments + enumerated in + [pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh) + +2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and + run one of the scripts in this directory. + +The Docker images are designed so that any "reasonable" build commands +will work; if you look in [build.sh](build.sh) you will see that it is a +very simple script. This is intentional. Idiomatic build instructions +should work inside all of our Docker images. You can tweak the commands +however you need (e.g., in case you want to rebuild with DEBUG, or rerun +the build with higher verbosity, etc.). + +We have to do some work to make this so. Here is a summary of the +mechanisms we use: + +- We install binaries to directories like `/usr/local/bin` which + are automatically part of your PATH. + +- We add entries to the PATH using Docker ENV variables (so + they apply when you enter Docker) and `/etc/environment` (so they + continue to apply even if you sudo), instead of modifying + `PATH` in our build scripts. + +- We use `/etc/ld.so.conf.d` to register directories containing + shared libraries, instead of modifying `LD_LIBRARY_PATH` in our + build scripts. + +- We reroute well known paths like `/usr/bin/gcc` to alternate + implementations with `update-alternatives, instead of setting + `CC` and `CXX` in our implementations. diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh new file mode 100755 index 0000000..4ece2ae --- /dev/null +++ b/.jenkins/pytorch/build-asan.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. 
+ +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Clang version:" +clang --version + +# detect_leaks=0: Python is very leaky, so we need suppress it +# symbolize=1: Gives us much better errors when things go wrong +export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + +# TODO: Make the ASAN flags a more unified env var +CC="clang" CXX="clang++" LDSHARED="clang --shared" \ + CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \ + NO_CUDA=1 DEBUG=1 \ + python setup.py install diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh new file mode 100755 index 0000000..bb06df2 --- /dev/null +++ b/.jenkins/pytorch/build.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then + exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* +fi + +# TODO: move this to Docker +# TODO: add both NCCL and MPI in CI test by fixing these test first +# sudo apt-get update +# sudo apt-get install libnccl-dev libnccl2 +# sudo apt-get install openmpi-bin libopenmpi-dev + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Python version:" +python --version + +echo "GCC version:" +gcc --version + +echo "CMake version:" +cmake --version + +# TODO: Don't run this... +pip install -r requirements.txt || true + +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + export HCC_AMDGPU_TARGET=gfx900 + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + sudo chown -R jenkins:jenkins /usr/local + rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true + python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" + USE_ROCM=1 python setup.py install + exit +fi + +# TODO: Don't install this here +if ! which conda; then + pip install mkl mkl-devel +fi + +# sccache will fail for CUDA builds if all cores are used for compiling +# gcc 7 with sccache seems to have intermittent OOM issue if all cores are used +if ([[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]) && which sccache > /dev/null; then + export MAX_JOBS=`expr $(nproc) - 1` +fi + +# Target only our CI GPU machine's CUDA arch to speed up the build +export TORCH_CUDA_ARCH_LIST=5.2 + +if [[ "$BUILD_ENVIRONMENT" == *trusty-py3.6-gcc5.4* ]]; then + export DEBUG=1 +fi + +WERROR=1 python setup.py install + +# Add the test binaries so that they won't be git clean'ed away +git add -f build/bin + +# Testing ATen install +if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then + echo "Testing ATen install" + time tools/test_aten_install.sh +fi + +# Test C FFI plugins +# cffi install doesn't work for Python 3.7 +if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then + # TODO: Don't run this here + pip install cffi + git clone https://github.com/pytorch/extension-ffi.git + pushd extension-ffi/script + python build.py + popd +fi + +# Test documentation build +if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then + pushd docs + # TODO: Don't run this here + pip install -r requirements.txt || true + make html + popd +fi + +# Test no-Python build +if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Building libtorch" + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. 
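+  # WERROR=1 and VERBOSE=1 are set in the environment of build_all.sh; as with
+  # the setup.py build above, the intent appears to be to treat compiler
+  # warnings as errors and to emit verbose build output.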
+ WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$PWD/../cpp-build" +fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh new file mode 100644 index 0000000..ca728df --- /dev/null +++ b/.jenkins/pytorch/common.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Common setup for all Jenkins scripts + +# NB: define this function before set -x, so that we don't +# pollute the log with a premature EXITED_USER_LAND ;) +function cleanup { + # Note that if you've exited user land, then CI will conclude that + # any failure is the CI's fault. So we MUST only output this + # string + retcode=$? + set +x + if [ $retcode -eq 0 ]; then + echo "EXITED_USER_LAND" + fi +} + +set -ex + +# Required environment variables: +# $BUILD_ENVIRONMENT (should be set by your Docker image) + +# This token is used by a parser on Jenkins logs for determining +# if a failure is a legitimate problem, or a problem with the build +# system; to find out more, grep for this string in ossci-job-dsl. +echo "ENTERED_USER_LAND" + +# compositional trap taken from https://stackoverflow.com/a/7287873/23845 + +# note: printf is used instead of echo to avoid backslash +# processing and to properly handle values that begin with a '-'. + +log() { printf '%s\n' "$*"; } +error() { log "ERROR: $*" >&2; } +fatal() { error "$@"; exit 1; } + +# appends a command to a trap +# +# - 1st arg: code to add +# - remaining args: names of traps to modify +# +trap_add() { + trap_add_cmd=$1; shift || fatal "${FUNCNAME} usage error" + for trap_add_name in "$@"; do + trap -- "$( + # helper fn to get existing trap command from output + # of trap -p + extract_trap_cmd() { printf '%s\n' "$3"; } + # print existing trap command with newline + eval "extract_trap_cmd $(trap -p "${trap_add_name}")" + # print the new trap command + printf '%s\n' "${trap_add_cmd}" + )" "${trap_add_name}" \ + || fatal "unable to add to trap ${trap_add_name}" + done +} +# set the trace attribute for the above function. this is +# required to modify DEBUG or RETURN traps because functions don't +# inherit them unless the trace attribute is set +declare -f -t trap_add + +trap_add cleanup EXIT + +if which sccache > /dev/null; then + # Save sccache logs to file + sccache --stop-server || true + rm ~/sccache_error.log || true + SCCACHE_ERROR_LOG=~/sccache_error.log RUST_LOG=sccache::server=error sccache --start-server + + # Report sccache stats for easier debugging + sccache --zero-stats + function sccache_epilogue() { + echo '=================== sccache compilation log ===================' + python $(dirname "${BASH_SOURCE[0]}")/print_sccache_log.py ~/sccache_error.log + echo '=========== If your build fails, please take a look at the log above for possible reasons ===========' + sccache --show-stats + sccache --stop-server || true + } + trap_add sccache_epilogue EXIT +fi + +if which ccache > /dev/null; then + # Report ccache stats for easier debugging + ccache --zero-stats + ccache --show-stats + function ccache_epilogue() { + ccache --show-stats + } + trap_add ccache_epilogue EXIT +fi + +# It's called a COMPACT_JOB_NAME because it's distinct from the +# Jenkin's provided JOB_NAME, which also includes a prefix folder +# e.g. 
pytorch-builds/ + +if [ -z "$COMPACT_JOB_NAME" ]; then + echo "Jenkins build scripts must set COMPACT_JOB_NAME" + exit 1 +fi + +if grep --line-regexp -q "$COMPACT_JOB_NAME" "$(dirname "${BASH_SOURCE[0]}")/disabled-configs.txt"; then + echo "Job is explicitly disabled, SKIPPING" + exit 0 +else + echo "Job is not disabled, proceeding" +fi + +if grep --line-regexp -q "$COMPACT_JOB_NAME" "$(dirname "${BASH_SOURCE[0]}")/enabled-configs.txt"; then + echo "Job is enabled, proceeding" +else + echo "Job is not enabled, FAILING now (revert changes to enabled-configs.txt to fix this)" + exit 1 +fi + +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then + BUILD_TEST_LIBTORCH=1 +else + BUILD_TEST_LIBTORCH=0 +fi + +# Use conda cmake in some CI build. Conda cmake will be newer than our supported +# min version 3.5, so we only do it in two builds that we know should use conda. +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then + if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \ + [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then + if ! which conda; then + echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty" + exit 1 + else + conda install -q -y cmake + fi + else + if ! cmake --version | grep 'cmake version 3\.5'; then + echo "Expected ${BUILD_ENVIRONMENT} to have cmake version 3.5.* (min support version), but 'cmake --version' returns:" + cmake --version + exit 1 + fi + fi +fi diff --git a/.jenkins/pytorch/dirty.sh b/.jenkins/pytorch/dirty.sh new file mode 100755 index 0000000..cc0d90e --- /dev/null +++ b/.jenkins/pytorch/dirty.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -ex +upstream="$1" +pr="$2" +git diff --name-only "$upstream" "$pr" +# Now that PyTorch build depends on Caffe2, unconditionally trigger +# for any changes. +# TODO: Replace this with a NEGATIVE regex that allows us to blacklist +# files (letting us skip builds when they are unnecessary) +#git diff --name-only "$upstream" "$pr" | grep -Eq '^(aten/|caffe2/|.jenkins/pytorch|docs/(make.bat|Makefile|requirements.txt|source)|mypy|requirements.txt|setup.py|test/|third_party/|tools/|\.gitmodules|torch/)' diff --git a/.jenkins/pytorch/disabled-configs.txt b/.jenkins/pytorch/disabled-configs.txt new file mode 100644 index 0000000..cdd51d3 --- /dev/null +++ b/.jenkins/pytorch/disabled-configs.txt @@ -0,0 +1,5 @@ +# This file contains a list of disabled configurations. Disabled +# configurations are skipped and not considered a failure if they +# fail. You can use this to temporarily reserve a test name to +# turn on CI side before PyTorch repository supports it. This +# file has the same format as .jenkins/enabled-configs.txt diff --git a/.jenkins/pytorch/docker-build-test.sh b/.jenkins/pytorch/docker-build-test.sh new file mode 100755 index 0000000..508699a --- /dev/null +++ b/.jenkins/pytorch/docker-build-test.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +COMPACT_JOB_NAME="docker-build-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +docker build -t pytorch . diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt new file mode 100644 index 0000000..14e3144 --- /dev/null +++ b/.jenkins/pytorch/enabled-configs.txt @@ -0,0 +1,43 @@ +# This file contains a list of enabled configurations +# to perform tests on. If you want to run tests on CI on +# a limited set of tests before enabling the full test suite, +# you can delete lines from this file. 
Any test that is not +# in this file will report a failure (so you don't forget to +# reenable the tests on merge ;) + +pytorch-linux-xenial-cuda8-cudnn6-py3-build +pytorch-linux-xenial-cuda8-cudnn6-py3-test +pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test +pytorch-linux-xenial-cuda9-cudnn7-py2-build +pytorch-linux-xenial-cuda9-cudnn7-py2-test +pytorch-linux-xenial-cuda9-cudnn7-py3-build +pytorch-linux-xenial-cuda9-cudnn7-py3-test +pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build +pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test +pytorch-linux-xenial-py3-clang5-asan-build +pytorch-linux-xenial-py3-clang5-asan-test +pytorch-linux-trusty-py2.7.9-build +pytorch-linux-trusty-py2.7.9-test +pytorch-linux-trusty-py2.7-build +pytorch-linux-trusty-py2.7-test +pytorch-linux-trusty-py3.5-build +pytorch-linux-trusty-py3.5-test +pytorch-linux-trusty-py3.6-gcc4.8-build +pytorch-linux-trusty-py3.6-gcc4.8-test +pytorch-linux-trusty-py3.6-gcc5.4-build +pytorch-linux-trusty-py3.6-gcc5.4-test +pytorch-linux-trusty-py3.6-gcc7.2-build +pytorch-linux-trusty-py3.6-gcc7.2-test +pytorch-linux-trusty-py3.6-gcc7-build +pytorch-linux-trusty-py3.6-gcc7-test +pytorch-linux-trusty-pynightly-build +pytorch-linux-trusty-pynightly-test +pytorch-win-ws2016-cuda9-cudnn7-py3-build +pytorch-win-ws2016-cuda9-cudnn7-py3-test +pytorch-macos-10.13-py3-build +pytorch-macos-10.13-py3-test +pytorch-macos-10.13-cuda9.2-cudnn7-py3-build +pytorch-docker-build-test +short-perf-test-cpu +short-perf-test-gpu +py2-clang3.8-rocmnightly-ubuntu16.04-build diff --git a/.jenkins/pytorch/macos-build-test.sh b/.jenkins/pytorch/macos-build-test.sh new file mode 100755 index 0000000..330e093 --- /dev/null +++ b/.jenkins/pytorch/macos-build-test.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-build* ]]; then + source "$(dirname "${BASH_SOURCE[0]}")/macos-build.sh" +fi + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test* ]]; then + source "$(dirname "${BASH_SOURCE[0]}")/macos-test.sh" +fi diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh new file mode 100755 index 0000000..41b272e --- /dev/null +++ b/.jenkins/pytorch/macos-build.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-build" +export PATH="/usr/local/bin:$PATH" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Set up conda environment +export PYTORCH_ENV_DIR="${HOME}/pytorch-ci-env" +# If a local installation of conda doesn't exist, we download and install conda +if [ ! 
-d "${PYTORCH_ENV_DIR}/miniconda3" ]; then + mkdir -p ${PYTORCH_ENV_DIR} + curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${PYTORCH_ENV_DIR}/miniconda3.sh + bash ${PYTORCH_ENV_DIR}/miniconda3.sh -b -p ${PYTORCH_ENV_DIR}/miniconda3 +fi +export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH" +source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate +conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* + +git submodule update --init --recursive +export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ + +# Build PyTorch +if [[ "${JOB_BASE_NAME}" == *cuda9.2* ]]; then + export CUDA_VERSION=9.2 + export TORCH_CUDA_ARCH_LIST=5.2 + export PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/bin${PATH:+:${PATH}} + export DYLD_LIBRARY_PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}} + export CUDA_HOME=/Developer/NVIDIA/CUDA-${CUDA_VERSION} + export NO_CUDA=0 + + # Eigen gives "explicit specialization of class must precede its first use" error + # when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead. + export DEVELOPER_DIR=/Library/Developer/CommandLineTools +else + export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer +fi + +export MACOSX_DEPLOYMENT_TARGET=10.9 +export CXX=clang++ +export CC=clang +if which sccache > /dev/null; then + printf "#!/bin/sh\nexec sccache $(which clang++) \$*" > "${PYTORCH_ENV_DIR}/clang++" + chmod a+x "${PYTORCH_ENV_DIR}/clang++" + + printf "#!/bin/sh\nexec sccache $(which clang) \$*" > "${PYTORCH_ENV_DIR}/clang" + chmod a+x "${PYTORCH_ENV_DIR}/clang" + + if [[ "${JOB_BASE_NAME}" == *cuda* ]]; then + printf "#!/bin/sh\nexec sccache $(which nvcc) \$*" > "${PYTORCH_ENV_DIR}/nvcc" + chmod a+x "${PYTORCH_ENV_DIR}/nvcc" + export CUDA_NVCC_EXECUTABLE="${PYTORCH_ENV_DIR}/nvcc" + fi + + export PATH="${PYTORCH_ENV_DIR}:$PATH" +fi +# If we run too many parallel jobs, we will OOM +export MAX_JOBS=2 + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} + +python setup.py install + +# Upload torch binaries when the build job is finished +7z a ${IMAGE_COMMIT_TAG}.7z ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* +aws s3 cp ${IMAGE_COMMIT_TAG}.7z s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z --acl public-read diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh new file mode 100755 index 0000000..7dc760c --- /dev/null +++ b/.jenkins/pytorch/macos-test.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export PATH="/usr/local/bin:$PATH" + +# Set up conda environment +export PYTORCH_ENV_DIR="${HOME}/pytorch-ci-env" +# If a local installation of conda doesn't exist, we download and install conda +if [ ! 
-d "${PYTORCH_ENV_DIR}/miniconda3" ]; then + mkdir -p ${PYTORCH_ENV_DIR} + curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${PYTORCH_ENV_DIR}/miniconda3.sh + bash ${PYTORCH_ENV_DIR}/miniconda3.sh -b -p ${PYTORCH_ENV_DIR}/miniconda3 +fi +export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH" +source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate +conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* + +git submodule update --init --recursive +export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ + +# Test PyTorch +if [[ "${JOB_BASE_NAME}" == *cuda9.2* ]]; then + # Eigen gives "explicit specialization of class must precede its first use" error + # when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead. + export DEVELOPER_DIR=/Library/Developer/CommandLineTools +else + export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer +fi +export MACOSX_DEPLOYMENT_TARGET=10.9 +export CXX=clang++ +export CC=clang +# If we run too many parallel jobs, we will OOM +export MAX_JOBS=2 + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} + +# Download torch binaries in the test jobs +rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* +aws s3 cp s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z ${IMAGE_COMMIT_TAG}.7z +7z x ${IMAGE_COMMIT_TAG}.7z -o"${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages" + +test_python_all() { + echo "Ninja version: $(ninja --version)" + python test/run_test.py --verbose +} + +test_cpp_api() { + # C++ API + + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. + # But still clean it before we perform our own build. + # + CPP_BUILD="$PWD/../cpp-build" + rm -rf $CPP_BUILD + mkdir -p $CPP_BUILD + WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$CPP_BUILD" + + python tools/download_mnist.py --quiet -d test/cpp/api/mnist + + # Unfortunately it seems like the test can't load from miniconda3 + # without these paths being set + export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib" + "$CPP_BUILD"/libtorch/bin/test_api +} + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then + test_python_all + test_cpp_api +else + if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + test_python_all + elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then + test_cpp_api + fi +fi diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh new file mode 100755 index 0000000..ceee027 --- /dev/null +++ b/.jenkins/pytorch/multigpu-test.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-multigpu-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +echo "Testing pytorch (distributed only)" +time python test/run_test.py --verbose -i distributed diff --git a/.jenkins/pytorch/perf_test/common.sh b/.jenkins/pytorch/perf_test/common.sh new file mode 100644 index 0000000..21ce05f --- /dev/null +++ b/.jenkins/pytorch/perf_test/common.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +run_test () { + rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/ + "$@" + cd .. 
&& rm -rf test_tmp/ +} + +get_runtime_of_command () { + TIMEFORMAT=%R + + # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null) + runtime=$( { time $@; } 2>&1 1>/dev/null) + if [[ $runtime == *"Error"* ]]; then + exit 1 + fi + runtime=${runtime#+++ $@} + runtime=$(python -c "print($runtime)") + + echo $runtime +} diff --git a/.jenkins/pytorch/perf_test/compare_with_baseline.py b/.jenkins/pytorch/perf_test/compare_with_baseline.py new file mode 100644 index 0000000..0fbeda6 --- /dev/null +++ b/.jenkins/pytorch/perf_test/compare_with_baseline.py @@ -0,0 +1,66 @@ +import sys +import json +import numpy +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--test-name', dest='test_name', action='store', + required=True, help='test name') +parser.add_argument('--sample-stats', dest='sample_stats', action='store', + required=True, help='stats from sample') +parser.add_argument('--update', action='store_true', + help='whether to update baseline using stats from sample') +args = parser.parse_args() + +test_name = args.test_name + +if 'cpu' in test_name: + backend = 'cpu' +elif 'gpu' in test_name: + backend = 'gpu' + +data_file_path = '../{}_runtime.json'.format(backend) + +with open(data_file_path) as data_file: + data = json.load(data_file) + +if test_name in data: + mean = float(data[test_name]['mean']) + sigma = float(data[test_name]['sigma']) +else: + # Let the test pass if baseline number doesn't exist + mean = sys.maxsize + sigma = 0.001 + +print("population mean: ", mean) +print("population sigma: ", sigma) + +sample_stats_data = json.loads(args.sample_stats) + +sample_mean = sample_stats_data['mean'] +sample_sigma = sample_stats_data['sigma'] + +print("sample mean: ", sample_mean) +print("sample sigma: ", sample_sigma) + +z_value = (sample_mean - mean) / sigma + +print("z-value: ", z_value) + +if z_value >= 3: + raise Exception('''\n +z-value >= 3, there is high chance of perf regression.\n +To reproduce this regression, run `cd .jenkins/pytorch/perf_test/ && bash ''' + test_name + '''.sh` on your local machine and compare the runtime before/after your code change. +''') +else: + print("z-value < 3, no perf regression detected.") + if args.update: + print("We will use these numbers as new baseline.") + new_data_file_path = '../new_{}_runtime.json'.format(backend) + with open(new_data_file_path) as new_data_file: + new_data = json.load(new_data_file) + new_data[test_name] = {} + new_data[test_name]['mean'] = sample_mean + new_data[test_name]['sigma'] = max(sample_sigma, sample_mean * 0.1) + with open(new_data_file_path, 'w') as new_data_file: + json.dump(new_data, new_data_file, indent=4) diff --git a/.jenkins/pytorch/perf_test/get_stats.py b/.jenkins/pytorch/perf_test/get_stats.py new file mode 100644 index 0000000..9e6e72a --- /dev/null +++ b/.jenkins/pytorch/perf_test/get_stats.py @@ -0,0 +1,16 @@ +import sys +import json +import numpy + +sample_data_list = sys.argv[1:] +sample_data_list = [float(v.strip()) for v in sample_data_list] + +sample_mean = numpy.mean(sample_data_list) +sample_sigma = numpy.std(sample_data_list) + +data = { + 'mean': sample_mean, + 'sigma': sample_sigma, +} + +print(json.dumps(data)) diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh new file mode 100644 index 0000000..e1360c7 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +. 
./common.sh + +test_cpu_speed_mini_sequence_labeler () { + echo "Testing: mini sequence labeler, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 726567a455edbfda6199445922a8cfee82535664 + + cd scripts/mini_sequence_labeler + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py) + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_mini_sequence_labeler "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh new file mode 100644 index 0000000..af3d32a --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +. ./common.sh + +test_cpu_speed_mnist () { + echo "Testing: MNIST, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/mnist + + pip install -r requirements.txt + + # Download data + python main.py --epochs 0 + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_mnist "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh new file mode 100644 index 0000000..cd4776c --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh @@ -0,0 +1,28 @@ +. ./common.sh + +test_cpu_speed_torch () { + echo "Testing: torch.*, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/yf225/perf-tests.git + + if [ "$1" == "compare_with_baseline" ]; then + export ARGS="--compare ../cpu_runtime.json" + elif [ "$1" == "compare_and_update" ]; then + export ARGS="--compare ../cpu_runtime.json --update ../new_cpu_runtime.json" + elif [ "$1" == "update_only" ]; then + export ARGS="--update ../new_cpu_runtime.json" + fi + + if ! python perf-tests/modules/test_cpu_torch.py ${ARGS}; then + echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash "${FUNCNAME[0]}".sh\` on your local machine and compare the runtime before/after your code change." 
+ exit 1 + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_torch "$@" +fi + diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh b/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh new file mode 100644 index 0000000..c924e2e --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh @@ -0,0 +1,28 @@ +. ./common.sh + +test_cpu_speed_torch_tensor () { + echo "Testing: torch.Tensor.*, CPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/yf225/perf-tests.git + + if [ "$1" == "compare_with_baseline" ]; then + export ARGS="--compare ../cpu_runtime.json" + elif [ "$1" == "compare_and_update" ]; then + export ARGS="--compare ../cpu_runtime.json --update ../new_cpu_runtime.json" + elif [ "$1" == "update_only" ]; then + export ARGS="--update ../new_cpu_runtime.json" + fi + + if ! python perf-tests/modules/test_cpu_torch_tensor.py ${ARGS}; then + echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash "${FUNCNAME[0]}".sh\` on your local machine and compare the runtime before/after your code change." + exit 1 + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_cpu_speed_torch_tensor "$@" +fi + diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh new file mode 100644 index 0000000..ab02eb8 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_cudnn_lstm () { + echo "Testing: CuDNN LSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_cudnn_lstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh new file mode 100644 index 0000000..ddc0d6f --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_lstm () { + echo "Testing: LSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. 
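+  # Back in the test_tmp/ scratch directory created by run_test, so the ../
+  # paths below resolve to the helper scripts in perf_test/.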
+ + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_lstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh new file mode 100644 index 0000000..fd76267 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_mlstm () { + echo "Testing: MLSTM, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/benchmark.git + + cd benchmark/ + + git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 + + cd scripts/ + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_mlstm "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh new file mode 100644 index 0000000..61d7585 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +. ./common.sh + +test_gpu_speed_mnist () { + echo "Testing: MNIST, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/mnist + + pip install -r requirements.txt + + # Download data + python main.py --epochs 0 + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_mnist "$@" +fi diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh new file mode 100644 index 0000000..89ed044 --- /dev/null +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +. 
./common.sh + +test_gpu_speed_word_language_model () { + echo "Testing: word language model on Wikitext-2, GPU" + + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + + git clone https://github.com/pytorch/examples.git -b perftests + + cd examples/word_language_model + + cd data/wikitext-2 + + # Reduce dataset size, so that we can have more runs per test + sed -n '1,200p' test.txt > test_tmp.txt + sed -n '1,1000p' train.txt > train_tmp.txt + sed -n '1,200p' valid.txt > valid_tmp.txt + + mv test_tmp.txt test.txt + mv train_tmp.txt train.txt + mv valid_tmp.txt valid.txt + + cd ../.. + + SAMPLE_ARRAY=() + NUM_RUNS=$1 + + for (( i=1; i<=$NUM_RUNS; i++ )) do + runtime=$(get_runtime_of_command python main.py --cuda --epochs 1) + echo $runtime + SAMPLE_ARRAY+=(${runtime}) + done + + cd ../.. + + stats=$(python ../get_stats.py ${SAMPLE_ARRAY[@]}) + echo "Runtime stats in seconds:" + echo $stats + + if [ "$2" == "compare_with_baseline" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" + elif [ "$2" == "compare_and_update" ]; then + python ../compare_with_baseline.py --test-name ${FUNCNAME[0]} --sample-stats "${stats}" --update + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + run_test test_gpu_speed_word_language_model "$@" +fi diff --git a/.jenkins/pytorch/perf_test/update_commit_hash.py b/.jenkins/pytorch/perf_test/update_commit_hash.py new file mode 100644 index 0000000..ee7fa8a --- /dev/null +++ b/.jenkins/pytorch/perf_test/update_commit_hash.py @@ -0,0 +1,13 @@ +import sys +import json + +data_file_path = sys.argv[1] +commit_hash = sys.argv[2] + +with open(data_file_path) as data_file: + data = json.load(data_file) + +data['commit'] = commit_hash + +with open(data_file_path, 'w') as data_file: + json.dump(data, data_file) diff --git a/.jenkins/pytorch/print_sccache_log.py b/.jenkins/pytorch/print_sccache_log.py new file mode 100644 index 0000000..c914728 --- /dev/null +++ b/.jenkins/pytorch/print_sccache_log.py @@ -0,0 +1,11 @@ +import sys + +log_file_path = sys.argv[1] + +with open(log_file_path) as f: + lines = f.readlines() + +for line in lines: + # Ignore errors from CPU instruction set testing + if 'src.c' not in line: + print(line) diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh b/.jenkins/pytorch/short-perf-test-cpu.sh new file mode 100755 index 0000000..5aa86cb --- /dev/null +++ b/.jenkins/pytorch/short-perf-test-cpu.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +COMPACT_JOB_NAME="short-perf-test-cpu" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +cd .jenkins/pytorch/perf_test + +echo "Running CPU perf test for PyTorch..." 
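+# awscli is used below to download the baseline runtime JSON from S3 and, when building master, to upload the updated baseline.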
+ +pip install awscli + +# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read +# More info at https://github.com/aws/aws-cli/issues/2321 +aws configure set default.s3.multipart_threshold 5GB + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Get current master commit hash + export MASTER_COMMIT_ID=$(git log --format="%H" -n 1) +fi + +# Find the master commit to test against +git remote add upstream https://github.com/pytorch/pytorch.git +git fetch upstream +IFS=$'\n' +master_commit_ids=($(git rev-list upstream/master)) +for commit_id in "${master_commit_ids[@]}"; do + if aws s3 ls s3://ossci-perf-test/pytorch/cpu_runtime/${commit_id}.json; then + LATEST_TESTED_COMMIT=${commit_id} + break + fi +done +aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/${LATEST_TESTED_COMMIT}.json cpu_runtime.json + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Prepare new baseline file + cp cpu_runtime.json new_cpu_runtime.json + python update_commit_hash.py new_cpu_runtime.json ${MASTER_COMMIT_ID} +fi + +# Include tests +. ./test_cpu_speed_mini_sequence_labeler.sh +. ./test_cpu_speed_mnist.sh +. ./test_cpu_speed_torch.sh +. ./test_cpu_speed_torch_tensor.sh + +# Run tests +export TEST_MODE="compare_with_baseline" +if [[ "$COMMIT_SOURCE" == master ]]; then + export TEST_MODE="compare_and_update" +fi + +# Operator tests +run_test test_cpu_speed_torch ${TEST_MODE} +run_test test_cpu_speed_torch_tensor ${TEST_MODE} + +# Sample model tests +run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE} +run_test test_cpu_speed_mnist 20 ${TEST_MODE} + +if [[ "$COMMIT_SOURCE" == master ]]; then + # This could cause race condition if we are testing the same master commit twice, + # but the chance of them executing this line at the same time is low. + aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/${MASTER_COMMIT_ID}.json --acl public-read +fi diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh b/.jenkins/pytorch/short-perf-test-gpu.sh new file mode 100755 index 0000000..dc59fde --- /dev/null +++ b/.jenkins/pytorch/short-perf-test-gpu.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +COMPACT_JOB_NAME="short-perf-test-gpu" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +pushd .jenkins/pytorch/perf_test + +echo "Running GPU perf test for PyTorch..." + +pip install awscli + +# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read +# More info at https://github.com/aws/aws-cli/issues/2321 +aws configure set default.s3.multipart_threshold 5GB + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Get current master commit hash + export MASTER_COMMIT_ID=$(git log --format="%H" -n 1) +fi + +# Find the master commit to test against +git remote add upstream https://github.com/pytorch/pytorch.git +git fetch upstream +IFS=$'\n' +master_commit_ids=($(git rev-list upstream/master)) +for commit_id in "${master_commit_ids[@]}"; do + if aws s3 ls s3://ossci-perf-test/pytorch/gpu_runtime/${commit_id}.json; then + LATEST_TESTED_COMMIT=${commit_id} + break + fi +done +aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/${LATEST_TESTED_COMMIT}.json gpu_runtime.json + +if [[ "$COMMIT_SOURCE" == master ]]; then + # Prepare new baseline file + cp gpu_runtime.json new_gpu_runtime.json + python update_commit_hash.py new_gpu_runtime.json ${MASTER_COMMIT_ID} +fi + +# Include tests +. ./test_gpu_speed_mnist.sh +. ./test_gpu_speed_word_language_model.sh +. ./test_gpu_speed_cudnn_lstm.sh +. ./test_gpu_speed_lstm.sh +. 
./test_gpu_speed_mlstm.sh + +# Run tests +if [[ "$COMMIT_SOURCE" == master ]]; then + run_test test_gpu_speed_mnist 20 compare_and_update + run_test test_gpu_speed_word_language_model 20 compare_and_update + run_test test_gpu_speed_cudnn_lstm 20 compare_and_update + run_test test_gpu_speed_lstm 20 compare_and_update + run_test test_gpu_speed_mlstm 20 compare_and_update +else + run_test test_gpu_speed_mnist 20 compare_with_baseline + run_test test_gpu_speed_word_language_model 20 compare_with_baseline + run_test test_gpu_speed_cudnn_lstm 20 compare_with_baseline + run_test test_gpu_speed_lstm 20 compare_with_baseline + run_test test_gpu_speed_mlstm 20 compare_with_baseline +fi + +if [[ "$COMMIT_SOURCE" == master ]]; then + # This could cause race condition if we are testing the same master commit twice, + # but the chance of them executing this line at the same time is low. + aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/${MASTER_COMMIT_ID}.json --acl public-read +fi + +popd diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh new file mode 100755 index 0000000..bc27628 --- /dev/null +++ b/.jenkins/pytorch/test.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}-test" +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +echo "Testing pytorch" + +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + echo "Skipping ROCm tests for now" + exit 0 +fi + +# JIT C++ extensions require ninja. +git clone https://github.com/ninja-build/ninja --quiet +pushd ninja +python ./configure.py --bootstrap +export PATH="$PWD:$PATH" +popd + +# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems +# if you're not careful. Check this if you made some changes and the +# ASAN test is not working +if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then + export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + export UBSAN_OPTIONS=print_stacktrace=1 + export PYTORCH_TEST_WITH_ASAN=1 + export PYTORCH_TEST_WITH_UBSAN=1 + # TODO: Figure out how to avoid hard-coding these paths + export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer + export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so + # Increase stack size, because ASAN red zones use more stack + ulimit -s 81920 + + function get_exit_code() { + set +e + "$@" + retcode=$? + set -e + return $retcode + } + (cd test && python -c "import torch") + echo "The next three invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured" + (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_asan(3)") + (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_ubsan(0)") + (cd test && ! 
get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") +fi + +export ATEN_DISABLE_AVX= +export ATEN_DISABLE_AVX2= +if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then + export ATEN_DISABLE_AVX=1 +fi +if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_DISABLE_AVX2=1 +fi + +test_python_nn() { + time python test/run_test.py --include nn --verbose +} + +test_python_all_except_nn() { + time python test/run_test.py --exclude nn --verbose +} + +test_aten() { + # Test ATen + if [[ "$BUILD_ENVIRONMENT" != *asan* ]]; then + echo "Running ATen tests with pytorch lib" + TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib + # NB: the ATen test binaries don't have RPATH set, so it's necessary to + # put the dynamic libraries somewhere were the dynamic linker can find them. + # This is a bit of a hack. + ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin + ln -s "$TORCH_LIB_PATH"/libnccl* build/bin + ls build/bin + aten/tools/run_tests.sh build/bin + fi +} + +test_torchvision() { + rm -rf ninja + + echo "Installing torchvision at branch master" + rm -rf vision + # TODO: This git clone is bad, it means pushes to torchvision can break + # PyTorch CI + git clone https://github.com/pytorch/vision --quiet + pushd vision + # python setup.py install with a tqdm dependency is broken in the + # Travis Python nightly (but not in latest Python nightlies, so + # this should be a transient requirement...) + # See https://github.com/pytorch/pytorch/issues/7525 + #time python setup.py install + pip install . + popd +} + +test_libtorch() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing libtorch" + CPP_BUILD="$PWD/../cpp-build" + if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + "$CPP_BUILD"/libtorch/bin/test_jit + else + "$CPP_BUILD"/libtorch/bin/test_jit "[cpu]" + fi + python tools/download_mnist.py --quiet -d test/cpp/api/mnist + OMP_NUM_THREADS=2 "$CPP_BUILD"/libtorch/bin/test_api + fi +} + +if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then + test_python_nn + test_python_all_except_nn + test_aten + test_torchvision + test_libtorch +else + if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then + test_python_nn + elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then + test_python_all_except_nn + test_aten + test_torchvision + test_libtorch + fi +fi diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh new file mode 100755 index 0000000..03adf17 --- /dev/null +++ b/.jenkins/pytorch/win-build.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# If you want to rebuild, run this with REBUILD=1 +# If you want to build with CUDA, run this with USE_CUDA=1 +# If you want to build without CUDA, run this with USE_CUDA=0 + +if [ ! -f setup.py ]; then + echo "ERROR: Please run this build script from PyTorch root directory." 
+ exit 1 +fi + +COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-build +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} +if [[ ${JOB_NAME} == *"develop"* ]]; then + export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} +fi + +mkdir -p ci_scripts/ + +cat >ci_scripts/upload_image.py << EOL + +import os +import sys +import boto3 + +IMAGE_COMMIT_TAG = os.getenv('IMAGE_COMMIT_TAG') + +session = boto3.session.Session() +s3 = session.resource('s3') +data = open(sys.argv[1], 'rb') +s3.Bucket('ossci-windows-build').put_object(Key='pytorch/'+IMAGE_COMMIT_TAG+'.7z', Body=data) +object_acl = s3.ObjectAcl('ossci-windows-build','pytorch/'+IMAGE_COMMIT_TAG+'.7z') +response = object_acl.put(ACL='public-read') + +EOL + +cat >ci_scripts/build_pytorch.bat < nul + del %CD%\\tmp_bin\\sccache.exe + if "%BUILD_ENVIRONMENT%"=="" ( + curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %CD%\\tmp_bin\\sccache.exe + ) else ( + aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe + ) + goto :check_sccache + ) +) + +:: Install Miniconda3 +if "%REBUILD%"=="" ( + IF EXIST C:\\Jenkins\\Miniconda3 ( rd /s /q C:\\Jenkins\\Miniconda3 ) + curl -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -O + .\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=C:\\Jenkins\\Miniconda3 +) +call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3 +if "%REBUILD%"=="" ( call conda install -y -q numpy cffi pyyaml boto3 ) + +:: Install ninja +if "%REBUILD%"=="" ( pip install ninja ) + +call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" x86_amd64 + +git submodule update --init --recursive + +set PATH=%CD%\\tmp_bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\libnvvp;%PATH% +set CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set CUDA_PATH_V9_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set NVTOOLSEXT_PATH=C:\\Program Files\\NVIDIA Corporation\\NvToolsExt +set CUDNN_LIB_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0\\lib\\x64 +set CUDA_TOOLKIT_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 +set CUDNN_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 + +:: Target only our CI GPU machine's CUDA arch to speed up the build +set TORCH_CUDA_ARCH_LIST=5.2 + +sccache --stop-server +sccache --start-server +sccache --zero-stats +set CC=sccache cl +set CXX=sccache cl + +set DISTUTILS_USE_SDK=1 + +set CMAKE_GENERATOR=Ninja + +if not "%USE_CUDA%"=="1" ( + if "%REBUILD%"=="" ( + set NO_CUDA=1 + python setup.py install + ) + if errorlevel 1 exit /b 1 + if not errorlevel 0 exit /b 1 +) + +if not "%USE_CUDA%"=="0" ( + if "%REBUILD%"=="" ( + sccache --show-stats + sccache --zero-stats + rd /s /q C:\\Jenkins\\Miniconda3\\Lib\\site-packages\\torch + copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe + ) + + set CUDA_NVCC_EXECUTABLE=%CD%\\tmp_bin\\nvcc + + if "%REBUILD%"=="" set NO_CUDA=0 + + python setup.py install && sccache --show-stats && ( + if "%BUILD_ENVIRONMENT%"=="" ( + echo "NOTE: To run \`import torch\`, please make sure to activate the conda environment by running \`call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3\` in Command Prompt before running Git Bash." 
+ ) else ( + 7z a %IMAGE_COMMIT_TAG%.7z C:\\Jenkins\\Miniconda3\\Lib\\site-packages\\torch && python ci_scripts\\upload_image.py %IMAGE_COMMIT_TAG%.7z + ) + ) +) + +EOL + +ci_scripts/build_pytorch.bat +if [ ! -f $IMAGE_COMMIT_TAG.7z ] && [ ! ${BUILD_ENVIRONMENT} == "" ]; then + exit 1 +fi +echo "BUILD PASSED" diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh new file mode 100755 index 0000000..a27b9f4 --- /dev/null +++ b/.jenkins/pytorch/win-test.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-test +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} +if [[ ${JOB_NAME} == *"develop"* ]]; then + export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} +fi + +mkdir -p ci_scripts/ + +cat >ci_scripts/download_image.py << EOL + +import os +import sys +import boto3 +import botocore + +IMAGE_COMMIT_TAG = os.getenv('IMAGE_COMMIT_TAG') + +session = boto3.session.Session() +s3 = session.resource('s3') +BUCKET_NAME = 'ossci-windows-build' +KEY = 'pytorch/'+IMAGE_COMMIT_TAG+'.7z' +LOCAL_FILE_PATH = sys.argv[1] +try: + s3.Bucket(BUCKET_NAME).download_file(KEY, LOCAL_FILE_PATH) +except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == "404": + print("The object does not exist.") + else: + raise + +EOL + +cat >ci_scripts/setup_pytorch_env.bat <ci_scripts/test_python_nn.bat <ci_scripts/test_python_all_except_nn.bat <>>>>>> mod diff --git a/aten/.flake8 b/aten/.flake8 new file mode 100644 index 0000000..5f32207 --- /dev/null +++ b/aten/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 + diff --git a/aten/.gitignore b/aten/.gitignore new file mode 100644 index 0000000..c57b970 --- /dev/null +++ b/aten/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +build/ +*.pyc diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt new file mode 100644 index 0000000..0dc61c5 --- /dev/null +++ b/aten/CMakeLists.txt @@ -0,0 +1,143 @@ +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + if (NOT BUILD_ATEN) + return() + endif() +else() + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + project(ATen CXX C) + include(CMakeDependentOption) + option(USE_CUDA "Use CUDA" ON) + option(USE_ROCM "Use ROCm" OFF) + option(USE_CUDNN "Use cuDNN" ON) + option(USE_MKLDNN "Use MKLDNN" ON) + cmake_dependent_option( + USE_CUDNN "Use cuDNN" ON + "USE_CUDA" OFF) + option(ATEN_NO_TEST "Do not build ATen test binaries" ON) + + # Flag for shared dependencies + set(BUILD_ATEN ON) +endif() + +# Find modules +list(APPEND CMAKE_MODULE_PATH + /usr/lib/x86_64-linux-gnu/ + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/public + ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules_CUDA_fix) +list(APPEND CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/) + +cmake_policy(SET CMP0012 NEW) + +############################################# + +set(ATen_CPU_SRCS) +set(ATen_CPU_TEST_SRCS) +set(ATen_CPU_INCLUDE) +set(ATen_THIRD_PARTY_INCLUDE) +set(ATen_CUDA_SRCS) +set(ATen_CUDA_TEST_SRCS) +set(ATen_CUDA_INCLUDE) +set(ATen_CPU_DEPENDENCY_LIBS) +set(ATen_CUDA_DEPENDENCY_LIBS) +set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS) +SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") +SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") +SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # ---[ Build variables set within the cmake tree + include(../cmake/BuildVariables.cmake) + 
set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") + + # ---[ Misc checks to cope with various compiler modes + include(../cmake/MiscCheck.cmake) + + # External projects + include(ExternalProject) + + # ---[ Utils + # TODO: merge the following 3 files into cmake/public/utils.cmake. + include(../cmake/Utils.cmake) + include(../cmake/public/utils.cmake) + + # ---[ Dependencies + include(../cmake/Dependencies.cmake) + list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) + list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) + list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS + ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) +endif() + +if(USE_CUDA) + list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) +endif() + +set(TH_LINK_STYLE STATIC) +add_subdirectory(src/TH) +set(TH_CPU_INCLUDE + # dense + ${CMAKE_CURRENT_SOURCE_DIR}/src/TH + ${CMAKE_CURRENT_SOURCE_DIR}/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/src/THC + + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_BINARY_DIR}/src + ${CMAKE_BINARY_DIR}/aten/src) +list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE}) +add_subdirectory(src/THNN) + +# Find the HIP package, set the HIP paths, load the HIP CMake. +IF(USE_ROCM) + include(LoadHIP) + if (NOT PYTORCH_FOUND_HIP) + MESSAGE(FATAL_ERROR + "Could not find HIP installation") + endif() +ENDIF() + +IF(MSVC) + # we want to respect the standard, and we are bored of those **** . + ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) + LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819 -Xcompiler /wd4503 -Xcompiler /wd4190 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4275 -Xcompiler /wd4522") +ENDIF(MSVC) + +if(USE_ROCM) + SET(AT_CUDA_ENABLED 1) + add_subdirectory(src/THC) + add_subdirectory(src/THCUNN) + message("ROCm is enabled.") +elseif(USE_CUDA) + SET(AT_CUDA_ENABLED 1) + add_subdirectory(src/THC) + add_subdirectory(src/THCUNN) +else() + message("disabling CUDA because USE_CUDA is set false") + SET(AT_CUDA_ENABLED 0) +endif() + +list(APPEND ATen_CPU_INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/src/THNN + ${CMAKE_CURRENT_SOURCE_DIR}/src/THCUNN) + +list(APPEND ATen_CPU_INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/catch/single_include + ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) +add_subdirectory(src/ATen) + +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # Pass source, includes, and libs to parent + set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) + set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) + set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) + set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) + set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) + set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) + set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) + set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) + set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +endif() diff --git a/aten/README.md b/aten/README.md new file mode 100644 index 0000000..e9ada01 --- /dev/null +++ b/aten/README.md @@ -0,0 +1,258 @@ +# ATen: A TENsor library + +ATen is a simple tensor library thats exposes the Tensor operations in Torch +and PyTorch directly in C++11. 
The wrapper respects the semantics of operators +in PyTorch, except minor details due to differences between C++ and Python in +the way default arguments are handled. See the [documentation for tensors](http://pytorch.org/docs/tensors.html) in PyTorch for what these operations do. +ATen's API is auto-generated from the same declarations PyTorch uses so the +two APIs will track each other over time. + +Tensor types are resolved dynamically, such that the API is generic and +does not include templates. That is, there is one `Tensor` type. It can hold a +CPU or CUDA Tensor, and the tensor may have Doubles, Float, Ints, etc. This design +makes it easy to write generic code without templating everything. + +See the _generated_ [`Tensor.h` file](doc/Tensor.h) and [`Functions.h` file](doc/Functions.h) for the provided API. Excerpt: +```c++ +Tensor atan2(const Tensor & other) const; +Tensor & atan2_(const Tensor & other); +Tensor pow(Scalar exponent) const; +Tensor pow(const Tensor & exponent) const; +Tensor & pow_(Scalar exponent); +Tensor & pow_(const Tensor & exponent); +Tensor lerp(const Tensor & end, Scalar weight) const; +Tensor & lerp_(const Tensor & end, Scalar weight); +Tensor histc() const; +Tensor histc(int64_t bins) const; +Tensor histc(int64_t bins, Scalar min) const; +Tensor histc(int64_t bins, Scalar min, Scalar max) const; +``` + +Inplace operations are also provided, and always suffixed by `_` to indicate they will modify the Tensor. + +### Installation + +TH/THC/THNN/THCUNN are provided (as git subtrees), so the repo is standalone. You will need a C++11 compiler, cmake, and the pyyaml python package. +``` + +# Install pyyaml used by python code generation to read API declarations + +# macOS: if you don't have pip +sudo easy_install pip +# Ubuntu: if you don't have pip +apt-get -y install python-pip + +# if you don't have pyyaml +sudo pip install pyyaml + +mkdir build +cd build +cmake .. -DCMAKE_INSTALL_PREFIX=/where/you/want # specify your dest directory +# cmake .. -DUSE_NVRTC=ON -DUSE_TENSORRT=OFF -DCMAKE_INSTALL_PREFIX=../install -DCAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO=OFF -DUSE_CUDA=ON # for CUDA +# cmake .. -DUSE_CUDA=OFF # for CPU only machines +make install +``` + +### Example usage + +Here is a simple example; again, the syntax follows Torch semantics. + +```c++ +using namespace at; // assumed in the following + +Tensor d = CPU(kFloat).ones({3, 4}); +Tensor r = CPU(kFloat).zeros({3,4}); +for(auto i = 0; i < 100000; i++) { + r = r.add(d); + // equivalently + r = r + d; + // or + r += d; +} +``` + +Want this running on the GPU? +```c++ +using namespace at; // assumed in the following + +Tensor d = CUDA(kFloat).ones({3, 4}); +Tensor r = CUDA(kFloat).zeros({3,4}); +for(auto i = 0; i < 100000; i++) { + r = r.add(d); + // equivalently + r = r + d; + // or + r += d; +} +``` + +Expressions like `CUDA(kFloat)` are first-class `at::Type` objects that represent +the type of a Tensor and are used to create Tensors when their type cannot be +inferred. See the _generated_ [Type header](doc/Type.h) for its API. + +See more in [sample files](src/ATen/test). + +### Creating your kernel + +It is easy to create new kernels, thanks to the `dispatch<>()` templated function. 
Example: +```c++ + +// a simple sum kernel (for CPU only) +template +struct sum_op { + // dispatch handles variable arguments for you + Tensor CPU(const Type & t, Tensor & x_) + { + Tensor x = x_.contiguous(); + auto x_p = x.data(); + int64_t size = x.numel(); + T sum = 0; + for(int64_t i = 0; i < size; i++) { + sum += x_p[i]; + } + return sum; + }; + Tensor CUDA(Tensor& x) { + throw std::invalid_argument("device not supported"); + }; +}; + +Tensor a = CPU(kFloat).rand({3, 7}); +std::cout << a << std::endl; +std::cout << dispatch(a.type(),a) << " == " << a.sum() << std::endl; +``` + +### Efficient access to tensor elements + +When using Tensor-wide operations, the relative cost of dynamic dispatch is very small. +However, there are cases, especially in your own kernels, where efficient element-wise access is needed, +and the cost of dynamic dispatch inside the element-wise loop is very high. +ATen provides _accessors_ that are created with a single dynamic check that a Tensor is the type and number of +dimensions. Accessors then expose an API for accessing the Tensor elements efficiently: + +```c++ + +Tensor foo = CPU(kFloat).rand({12,12}); + +// assert foo is 2-dimensional and holds floats. +auto foo_a = foo.accessor(); +float trace = 0; + +for(int i = 0; i < foo_a.size(0); i++) { + // use the accessor foo_a to get tensor data. + trace += foo_a[i][i]; +} +``` + +Accessors are temporary views of a Tensor. They are only valid for the lifetime of the tensor that they +view and hence should only be used locally in a function, like iterators. + +### Using externally created data + +If you already have your tensor data allocated in memory (CPU or CUDA), +you can view that memory as a Tensor in ATen: + +```c++ +float data[] = { 1, 2, 3, + 4, 5, 6}; +auto f = CPU(kFloat).tensorFromBlob(data, {2,3}); +cout << f << endl; +``` + +These tensors cannot be resized because ATen does not own the memory, but otherwise +behave as normal tensors. + +### Scalars and zero-dimensional tensors + +In addition to the `Tensor` objects, ATen also includes `Scalar`s that represent a single number. +Like a Tensor, Scalars are dynamically typed and can hold any one of ATen's [number types](doc/Type.h). +Scalars can be implicitly constructed from C++ number types. Scalars are needed because some functions like `addmm` take numbers along with Tensors and expect these +numbers to be the same dynamic type as the tensor. They are also used in the API to indicate places where +a function will _always_ return a Scalar value, like `sum`. + +```c++ +Tensor addmm(Scalar beta, const Tensor & self, + Scalar alpha, const Tensor & mat1, + const Tensor & mat2); +Scalar sum(const Tensor & self); + +//usage +Tensor a = ... +Tensor b = ... +Tensor c = ... +Tensor r = addmm(1.0, a, .5, b, c); +``` + +In addition to Scalars, ATen also allows Tensor objects to be zero-dimensional. These Tensors hold +a single value and they can be references to a single element in a larger Tensor. They can be used anywhere a Tensor is expected. They are normally created by operators like `select` which reduce the dimensions of +a Tensor. 
+ +```c++ +Tensor two = CPU(kFloat).rand({10,20}); +two[1][2] = 4; +//~~~~~~~ zero-dimensional Tensor +``` + +It is possible to convert between Scalar and zero-dim Tensors: + +```c++ +Tensor zero_dim = CPU(kFloat).scalarTensor(4); +Scalar from_tensor = Scalar(zero_dim); //only valid when zero_dim.dim() == 0; +``` + +### Avoiding unnecessary CUDA synchronization in your kernels when using Scalars + +Moving a single number from the GPU to the CPU introduces a synchronization point +that can add latency to your program. In certain cases the result of a GPU operator like `sum` which +returns a Scalar may be plugged into another GPU operator as an argument. If Scalars were always copied +to the CPU, this would result in 2 copies. To avoid these synchronizations, Scalar objects can be +optionally backed by a zero-dim Tensor, and are only copied to the CPU when requested. + +```c++ +auto a = CUDA(kFloat).rand({3,4}); +Scalar on_gpu = Scalar(a[1][1]); //backed by zero-dim Tensor +assert(on_gpu.isBackedByTensor()); + +double value = on_gpu.toDouble(); // copied to CPU, if it was backed by GPU Tensor. +Scalar svalue = on_gpu.local(); // force the Scalar to become local to CPU. + +// get the scalar as a zero-dim tensor. If it was already backed +// by a zero-dim Tensor then this op has no synchronization. +// if the Scalar was local on CPU, it performs the copy +Tensor same_tensor = CUDA(kFloat).scalarTensor(on_gpu); +``` + +Operators aware of the location of Scalars can arrange to do the minimal number of copies required. + +### Developer notes + +ATen relies heavily on code generation to automatically generate headers +and implementations for all of the tensor methods it supports. The main +entry point for the script which does all this work is +[`src/ATen/gen.py`](src/ATen/gen.py), which ingests +[`src/ATen/Declarations.cwrap`](src/ATen/Declarations.cwrap), +[`src/ATen/nn.yaml`](src/ATen/nn.yaml), +[`src/ATen/native/native_functions.yaml`](src/ATen/native/native_functions.yaml) and the THNN/THCUNN headers and +produces all of the headers and wrapping code necessary to generate +the ATen interface. + +If you need to understand how ATen understands a declaration after all +of this processing occurs, it's helpful to look at the generated file +`Declarations.yaml` (NB: not cwrap) which contains information for all +ATen methods in a uniform manner. This file is utilized by PyTorch +which further extends the ATen interface with support for automatic +differentation. + +#### Note [ATen preprocessor philosophy] + +ATen is designed to be simple to use, and one of the things this implies is +that it should not be necessary to use preprocessor macros when using ATen; +we would rather provide all symbols, even for functionality that is not +available on the system ATen is running on. + +This means that internally inside ATen, whereas other libraries might +simply omit source files for, e.g., CuDNN, when CuDNN libraries are not +installed, ATen will always build these source files, compiling stub +functions for anything that is not available. ATen never uses +`AT_ENABLED_CUDA()` in header files, and all types in ATen's public API +are always available no matter your build configuration. 
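+
+To make Note [ATen preprocessor philosophy] concrete, here is a minimal,
+hypothetical sketch of the pattern it describes; the function name, macro, and
+error message below are illustrative assumptions, not actual ATen code. The
+idea is that the same symbol always exists: when the backend is unavailable,
+a stub is compiled that fails at runtime instead of the symbol disappearing at
+compile time, so callers never need preprocessor guards of their own.
+
+```c++
+// Hypothetical illustration of the "always provide the symbol" pattern.
+#include <stdexcept>
+#include <string>
+
+// Always declared in the (hypothetical) public header, regardless of build flags.
+std::string cudnn_version_string();
+
+#if defined(MYLIB_WITH_CUDNN)  // hypothetical build flag
+// Compiled when the backend is available.
+std::string cudnn_version_string() {
+  return "cuDNN-enabled build";
+}
+#else
+// Stub compiled otherwise: the symbol still exists in every build configuration,
+// and misuse surfaces as a clear runtime error rather than a link-time hole.
+std::string cudnn_version_string() {
+  throw std::runtime_error("this build was compiled without cuDNN support");
+}
+#endif
+```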
diff --git a/aten/conda/build.sh b/aten/conda/build.sh new file mode 100644 index 0000000..f0ca38f --- /dev/null +++ b/aten/conda/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +if [ -z "$PREFIX" ]; then + PREFIX="$CONDA_PREFIX" +fi + +# When conda-build constructs a new working copy to perform a build +# in, it recursively copies *all* files and directories in the original +# source directory, including any pre-existing build products (e.g., +# if you previously ran cmake.) This is problematic, because if +# a 'build' directory already exists, cmake will reuse build settings +# rather than recompute them from scratch. We want a fresh build, so +# we prophylactically remove the build directory. +rm -rf build || true + +mkdir -p build +cd build +cmake -DCMAKE_INSTALL_PREFIX="$PREFIX" -DCMAKE_PREFIX_PATH="$PREFIX" -DCMAKE_BUILD_TYPE=Release $CONDA_CMAKE_ARGS .. +make install -j20 diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml new file mode 100644 index 0000000..7493e5d --- /dev/null +++ b/aten/conda/meta.yaml @@ -0,0 +1,33 @@ +{% set version = "0.1.dev" %} + +package: + name: aten + version: {{ version }} + +source: + path: .. + +build: + number: 1 + skip: True # [win] + script_env: + - CONDA_CMAKE_ARGS + +requirements: + build: + - cmake + - pyyaml + - setuptools + - python + - mkl # [not osx] + run: + - mkl # [not osx] + +about: + home: https://github.com/zdevito/ATen + license: BSD + summary: A TENsor library for C++11 + +extra: + recipe-maintainers: + - ezyang diff --git a/aten/doc/Functions.h b/aten/doc/Functions.h new file mode 100644 index 0000000..2fd9d72 --- /dev/null +++ b/aten/doc/Functions.h @@ -0,0 +1,3133 @@ +#pragma once + +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Tensor.h" +#include "ATen/Storage.h" +#include "ATen/Generator.h" + + +namespace at { + +static inline Tensor & zeros_out(Tensor & result, IntList size); +static inline Tensor & zeros_like_out(Tensor & result, const Tensor & input); +static inline Tensor zeros_like(const Tensor & input); +static inline Tensor & ones_out(Tensor & result, IntList size); +static inline Tensor & ones_like_out(Tensor & result, const Tensor & input); +static inline Tensor ones_like(const Tensor & input); +static inline int64_t numel(const Tensor & self); +static inline Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask); +static inline Tensor masked_select(const Tensor & self, const Tensor & mask); +static inline Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1); +static inline Tensor t(const Tensor & self); +static inline Tensor & nonzero_out(Tensor & result, const Tensor & self); +static inline Tensor nonzero(const Tensor & self); +static inline Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index); +static inline Tensor take(const Tensor & self, const Tensor & index); +static inline Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step=1); +static inline Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step=1); +static inline Tensor & arange_out(Tensor & result, Scalar end); +static inline Tensor & gather_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +static inline Tensor gather(const Tensor & self, int64_t dim, const Tensor & index); +static inline 
bool equal(const Tensor & self, const Tensor & other); +static inline Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __and__(const Tensor & self, Scalar other); +static inline Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __and__(const Tensor & self, const Tensor & other); +static inline Tensor & __iand__(Tensor & self, Scalar other); +static inline Tensor & __iand__(Tensor & self, const Tensor & other); +static inline Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __or__(const Tensor & self, Scalar other); +static inline Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __or__(const Tensor & self, const Tensor & other); +static inline Tensor & __ior__(Tensor & self, Scalar other); +static inline Tensor & __ior__(Tensor & self, const Tensor & other); +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __xor__(const Tensor & self, Scalar other); +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __xor__(const Tensor & self, const Tensor & other); +static inline Tensor & __ixor__(Tensor & self, Scalar other); +static inline Tensor & __ixor__(Tensor & self, const Tensor & other); +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __lshift__(const Tensor & self, Scalar other); +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __lshift__(const Tensor & self, const Tensor & other); +static inline Tensor & __ilshift__(Tensor & self, Scalar other); +static inline Tensor & __ilshift__(Tensor & self, const Tensor & other); +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor __rshift__(const Tensor & self, Scalar other); +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor __rshift__(const Tensor & self, const Tensor & other); +static inline Tensor & __irshift__(Tensor & self, Scalar other); +static inline Tensor & __irshift__(Tensor & self, const Tensor & other); +static inline Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor lt(const Tensor & self, Scalar other); +static inline Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor lt(const Tensor & self, const Tensor & other); +static inline Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor gt(const Tensor & self, Scalar other); +static inline Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor gt(const Tensor & self, const Tensor & other); +static inline Tensor & le_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor le(const Tensor & self, Scalar other); +static inline Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor le(const Tensor & self, const Tensor & other); +static inline Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor ge(const Tensor & self, Scalar other); +static inline Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline 
Tensor ge(const Tensor & self, const Tensor & other); +static inline Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor eq(const Tensor & self, Scalar other); +static inline Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor eq(const Tensor & self, const Tensor & other); +static inline Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor ne(const Tensor & self, Scalar other); +static inline Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor ne(const Tensor & self, const Tensor & other); +static inline std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor min(const Tensor & self, const Tensor & other); +static inline Tensor min(const Tensor & self); +static inline std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor max(const Tensor & self, const Tensor & other); +static inline Tensor max(const Tensor & self); +static inline std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false); +static inline std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false); +static inline std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool keepdim=false); +static inline std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false); +static inline std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim=false); +static inline std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor median(const Tensor & self); +static inline std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool descending=false); +static inline std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false); +static inline std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +static inline std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +static inline Tensor & abs_out(Tensor & result, const Tensor & self); +static inline Tensor abs(const Tensor & self); +static inline Tensor & sigmoid_out(Tensor & result, const Tensor & self); +static inline Tensor sigmoid(const Tensor & self); +static inline Tensor & log_out(Tensor & result, const Tensor & self); +static inline Tensor log(const Tensor & self); +static inline Tensor & log1p_out(Tensor & result, const Tensor & self); +static inline Tensor log1p(const Tensor & self); +static inline Tensor & lgamma_out(Tensor & result, const Tensor & self); +static inline Tensor lgamma(const Tensor & self); +static inline Tensor & digamma_out(Tensor & result, const Tensor & self); +static inline Tensor digamma(const Tensor & self); +static inline Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & 
self); +static inline Tensor polygamma(int64_t n, const Tensor & self); +static inline Tensor & exp_out(Tensor & result, const Tensor & self); +static inline Tensor exp(const Tensor & self); +static inline Tensor & expm1_out(Tensor & result, const Tensor & self); +static inline Tensor expm1(const Tensor & self); +static inline Tensor & cos_out(Tensor & result, const Tensor & self); +static inline Tensor cos(const Tensor & self); +static inline Tensor & acos_out(Tensor & result, const Tensor & self); +static inline Tensor acos(const Tensor & self); +static inline Tensor & cosh_out(Tensor & result, const Tensor & self); +static inline Tensor cosh(const Tensor & self); +static inline Tensor & sin_out(Tensor & result, const Tensor & self); +static inline Tensor sin(const Tensor & self); +static inline Tensor & asin_out(Tensor & result, const Tensor & self); +static inline Tensor asin(const Tensor & self); +static inline Tensor & sinh_out(Tensor & result, const Tensor & self); +static inline Tensor sinh(const Tensor & self); +static inline Tensor & tan_out(Tensor & result, const Tensor & self); +static inline Tensor tan(const Tensor & self); +static inline Tensor & atan_out(Tensor & result, const Tensor & self); +static inline Tensor atan(const Tensor & self); +static inline Tensor & tanh_out(Tensor & result, const Tensor & self); +static inline Tensor tanh(const Tensor & self); +static inline Tensor & erf_out(Tensor & result, const Tensor & self); +static inline Tensor erf(const Tensor & self); +static inline Tensor & erfc_out(Tensor & result, const Tensor & self); +static inline Tensor erfc(const Tensor & self); +static inline Tensor & erfinv_out(Tensor & result, const Tensor & self); +static inline Tensor erfinv(const Tensor & self); +static inline Tensor & sqrt_out(Tensor & result, const Tensor & self); +static inline Tensor sqrt(const Tensor & self); +static inline Tensor & rsqrt_out(Tensor & result, const Tensor & self); +static inline Tensor rsqrt(const Tensor & self); +static inline Tensor & ceil_out(Tensor & result, const Tensor & self); +static inline Tensor ceil(const Tensor & self); +static inline Tensor & floor_out(Tensor & result, const Tensor & self); +static inline Tensor floor(const Tensor & self); +static inline Tensor & round_out(Tensor & result, const Tensor & self); +static inline Tensor round(const Tensor & self); +static inline Tensor & trunc_out(Tensor & result, const Tensor & self); +static inline Tensor trunc(const Tensor & self); +static inline Tensor & frac_out(Tensor & result, const Tensor & self); +static inline Tensor frac(const Tensor & self); +static inline Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor mean(const Tensor & self); +static inline Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor var(const Tensor & self, bool unbiased=true); +static inline Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false); +static inline Tensor std(const Tensor & self, bool unbiased=true); +static inline Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim=false); 
+static inline Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false); +static inline Tensor norm(const Tensor & self, Scalar p=2); +static inline Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); +static inline Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); +static inline Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2); +static inline Tensor & reciprocal_out(Tensor & result, const Tensor & self); +static inline Tensor reciprocal(const Tensor & self); +static inline Tensor & neg_out(Tensor & result, const Tensor & self); +static inline Tensor neg(const Tensor & self); +static inline Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor atan2(const Tensor & self, const Tensor & other); +static inline Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent); +static inline Tensor pow(const Tensor & self, Scalar exponent); +static inline Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent); +static inline Tensor pow(const Tensor & self, const Tensor & exponent); +static inline Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self); +static inline Tensor pow(Scalar base, const Tensor & self); +static inline Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight); +static inline Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight); +static inline Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100); +static inline Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100); +static inline Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0); +static inline Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0); +static inline Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor sum(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor sum(const Tensor & self); +static inline Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false); +static inline Tensor prod(const Tensor & self); +static inline Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim); +static inline Tensor cumsum(const Tensor & self, int64_t dim); +static inline Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim); +static inline Tensor cumprod(const Tensor & self, int64_t dim); +static inline Tensor & sign_out(Tensor & result, const Tensor & self); +static inline Tensor sign(const Tensor & self); +static inline Tensor trace(const Tensor & self); +static inline Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha=1); +static inline Tensor add(const Tensor & self, SparseTensor other, Scalar alpha=1); +static inline Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1); +static 
inline Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1); +static inline Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1); +static inline Tensor & mul_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor mul(const Tensor & self, Scalar other); +static inline Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor mul(const Tensor & self, const Tensor & other); +static inline Tensor & div_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor div(const Tensor & self, Scalar other); +static inline Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor div(const Tensor & self, const Tensor & other); +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor fmod(const Tensor & self, Scalar other); +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor fmod(const Tensor & self, const Tensor & other); +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other); +static inline Tensor remainder(const Tensor & self, Scalar other); +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other); +static inline Tensor remainder(const Tensor & self, const Tensor & other); +static inline Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max); +static inline Tensor clamp(const Tensor & self, Scalar min, Scalar max); +static inline Tensor & clamp_(Tensor & self, Scalar min, Scalar max); +static inline Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min); +static inline Tensor clamp_min(const Tensor & self, Scalar min); +static inline Tensor & clamp_min_(Tensor & self, Scalar min); +static inline Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max); +static inline Tensor clamp_max(const Tensor & self, Scalar max); +static inline Tensor & clamp_max_(Tensor & self, Scalar max); +static inline Tensor _dot(const Tensor & self, const Tensor & tensor); +static inline Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor tril(const Tensor & self, int64_t diagonal=0); +static inline Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor triu(const Tensor & self, int64_t diagonal=0); +static inline Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim=-1); +static inline Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1); +static inline Tensor & eye_out(Tensor & result, int64_t n, int64_t m=-1); +static inline Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal=0); +static inline Tensor diag(const Tensor & self, int64_t diagonal=0); +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar 
+static inline Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1);
+static inline Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2);
+static inline Tensor _ger(const Tensor & self, const Tensor & vec2);
+static inline Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec);
+static inline Tensor _mv(const Tensor & self, const Tensor & vec);
+static inline Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+static inline Tensor _mm(const Tensor & self, const Tensor & mat2);
+static inline Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+static inline Tensor bmm(const Tensor & self, const Tensor & mat2);
+static inline Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
+static inline Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
+static inline std::tuple<Tensor &,Tensor &> gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor,Tensor> gesv(const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor &,Tensor &> gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor,Tensor> gels(const Tensor & self, const Tensor & A);
+static inline std::tuple<Tensor &,Tensor &> trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+static inline std::tuple<Tensor,Tensor> trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+static inline std::tuple<Tensor &,Tensor &> symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false, bool upper=true);
+static inline std::tuple<Tensor,Tensor> symeig(const Tensor & self, bool eigenvectors=false, bool upper=true);
+static inline std::tuple<Tensor &,Tensor &> eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false);
+static inline std::tuple<Tensor,Tensor> eig(const Tensor & self, bool eigenvectors=false);
+static inline std::tuple<Tensor &,Tensor &,Tensor &> svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some=true);
+static inline std::tuple<Tensor,Tensor,Tensor> svd(const Tensor & self, bool some=true);
+static inline Tensor & inverse_out(Tensor & output, const Tensor & self);
+static inline Tensor inverse(const Tensor & self);
+static inline Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper=true);
+static inline Tensor potrf(const Tensor & self, bool upper=true);
+static inline Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper=true);
+static inline Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true);
+static inline Tensor & potri_out(Tensor & output, const Tensor & self, bool upper=true);
+static inline Tensor potri(const Tensor & self, bool upper=true);
+static inline std::tuple<Tensor &,Tensor &> pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper=true, Scalar tol=-1);
+static inline std::tuple<Tensor,Tensor> pstrf(const Tensor & self, bool upper=true, Scalar tol=-1);
+static inline std::tuple<Tensor &,Tensor &> qr_out(Tensor & res1, Tensor & res2, const Tensor & self);
+static inline std::tuple<Tensor,Tensor> qr(const Tensor & self);
+static inline std::tuple<Tensor &,Tensor &> geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+static inline std::tuple<Tensor,Tensor> geqrf(const Tensor & self);
+static inline Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2);
+static inline Tensor orgqr(const Tensor & self, const Tensor & input2);
+static inline Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false);
+static inline Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false);
+static inline std::tuple<Tensor &,Tensor &> btrifact_out(Tensor & result, Tensor & pivots, const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor,Tensor> btrifact(const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor &,Tensor &,Tensor &> btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot=true);
+static inline std::tuple<Tensor,Tensor,Tensor> btrifact_with_info(const Tensor & self, bool pivot=true);
+static inline Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots);
+static inline Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots);
+static inline Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator=nullptr);
+static inline Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr);
+static inline Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, const Tensor & mean, double std=1, Generator * generator=nullptr);
+static inline Tensor normal(const Tensor & mean, double std=1, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor normal(double mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator=nullptr);
+static inline Tensor & rand_out(Tensor & result, IntList size, Generator * generator=nullptr);
+static inline Tensor & randn_out(Tensor & result, IntList size, Generator * generator=nullptr);
+static inline Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator=nullptr);
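[Editor's note: a minimal, hedged sketch of how the free functions declared above are intended to be called. The <ATen/ATen.h> include and the at::CPU(at::kFloat) factory follow the ATen conventions of this period and are assumptions for illustration; only the signatures that appear in this header are used.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>
#include <tuple>

void blas_and_linalg_sketch() {
  at::Tensor a    = at::CPU(at::kFloat).rand({2, 3});   // assumed factory API
  at::Tensor b    = at::CPU(at::kFloat).rand({3, 4});
  at::Tensor bias = at::CPU(at::kFloat).zeros({2, 4});

  // addmm(self, mat1, mat2, beta, alpha) computes beta*self + alpha*(mat1 x mat2).
  at::Tensor y = at::addmm(bias, a, b);

  // The *_out overloads write into a caller-supplied result instead of allocating.
  at::Tensor clamped = at::CPU(at::kFloat).zeros({2, 4});
  at::clamp_out(clamped, y, /*min=*/-1, /*max=*/1);

  // Factorizations return their multiple results as std::tuple.
  at::Tensor m = at::CPU(at::kFloat).rand({4, 4});
  at::Tensor q, r;
  std::tie(q, r) = at::qr(m);
}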
+static inline Tensor bernoulli(const Tensor & self, Generator * generator=nullptr); +static inline Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator=nullptr); +static inline Tensor _standard_gamma(const Tensor & self, Generator * generator=nullptr); +static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total); +static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total); +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size); +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values); +static inline Tensor alias(const Tensor & self); +static inline Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1); +static inline Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim=0); +static inline Tensor _cat(TensorList tensors, int64_t dim=0); +static inline Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true); +static inline Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true); +static inline Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce); +static inline Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor 
l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true); +static inline Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true); +static inline Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average); +static inline Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline std::tuple multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline std::tuple multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target); +static inline Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target); +static inline Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline Tensor nll_loss(const Tensor & self, const Tensor & target, const 
Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline std::tuple nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline std::tuple nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true); +static inline std::tuple nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline std::tuple nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce); +static inline Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight); +static inline Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true); +static inline Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce); +static inline Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true); +static inline Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true); +static inline Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor & 
soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average); +static inline Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor elu(const Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output); +static inline Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output); +static inline Tensor & elu_(Tensor & self, Scalar alpha=1, Scalar scale=1); +static inline Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale); +static inline Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim=-1); +static inline Tensor glu(const Tensor & self, int64_t dim=-1); +static inline Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor glu_forward(const Tensor & self, int64_t dim); +static inline Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim); +static inline Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim); +static inline Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor hardtanh(const Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & hardtanh_(Tensor & self, Scalar min_val=-1, Scalar max_val=1); +static inline Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val); +static inline Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope=0.01); +static inline Tensor leaky_relu(const Tensor & self, Scalar negative_slope=0.01); +static inline Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope); +static inline Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope); +static inline Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope); +static inline Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope); +static inline Tensor & leaky_relu_(Tensor & self, Scalar negative_slope=0.01); +static inline Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope); +static inline Tensor & log_sigmoid_out(Tensor & output, const Tensor & self); +static inline Tensor log_sigmoid(const Tensor & self); +static inline std::tuple log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self); +static inline std::tuple log_sigmoid_forward(const Tensor & self); +static inline Tensor & 
log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer); +static inline Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer); +static inline Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor log_softmax(const Tensor & self, int64_t dim); +static inline Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor log_softmax_forward(const Tensor & self, int64_t dim); +static inline Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight); +static inline Tensor prelu(const Tensor & self, const Tensor & weight); +static inline Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight); +static inline Tensor prelu_forward(const Tensor & self, const Tensor & weight); +static inline std::tuple prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight); +static inline std::tuple prelu_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, std::array output_mask={{true, true}}); +static inline Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); +static inline Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); +static inline Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator); +static inline Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor softmax(const Tensor & self, int64_t dim); +static inline Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim); +static inline Tensor softmax_forward(const Tensor & self, int64_t dim); +static inline Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); +static inline Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output); 
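[Editor's note: a hedged sketch of the op / op_forward / op_backward triplet that the loss and activation declarations above follow. Only signatures shown in this header are used; the at::CPU(at::kFloat) factory is again an assumed convenience from ATen documentation of this era.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void loss_backward_sketch() {
  at::Tensor input  = at::CPU(at::kFloat).rand({4, 5});
  at::Tensor target = at::CPU(at::kFloat).rand({4, 5});

  // Element-wise loss (reduce=false) keeps the per-element shape.
  at::Tensor loss = at::mse_loss(input, target, /*size_average=*/true, /*reduce=*/false);

  // The matching *_backward takes grad_output, the original inputs, and the same
  // reduction flags, and returns grad_input (d loss / d input).
  at::Tensor grad_output = at::CPU(at::kFloat).ones({4, 5});
  at::Tensor grad_input  = at::mse_loss_backward(grad_output, input, target,
                                                 /*size_average=*/true, /*reduce=*/false);
}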
+static inline Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta=1, Scalar threshold=20); +static inline Tensor softplus(const Tensor & self, Scalar beta=1, Scalar threshold=20); +static inline Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold); +static inline Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold); +static inline Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output); +static inline Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output); +static inline Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd=0.5); +static inline Tensor softshrink(const Tensor & self, Scalar lambd=0.5); +static inline Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd); +static inline Tensor softshrink_forward(const Tensor & self, Scalar lambd); +static inline Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd); +static inline Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd); +static inline Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold(const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value); +static inline Tensor & adaptive_avg_pool2d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self); +static inline Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self); +static inline Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self); +static inline Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self); +static inline std::tuple adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); 
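[Editor's note: a brief sketch of the three calling conventions visible above: a functional form that allocates its result, a *_out form that writes into a preallocated tensor, and a trailing-underscore form that mutates self in place. The adaptive pooling declarations continue immediately after this note. at::CPU(at::kFloat) is an assumed factory, not declared in this header.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void variant_sketch() {
  at::Tensor x = at::CPU(at::kFloat).rand({8});

  at::Tensor y = at::softplus(x);                    // allocates a fresh result
  at::Tensor out = at::CPU(at::kFloat).zeros({8});
  at::softplus_out(out, x);                          // fills 'out', returns it by reference
  at::threshold_(x, /*threshold=*/0, /*value=*/0);   // in place: ReLU-style clamp of x
}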
+static inline std::tuple adaptive_max_pool2d(const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline std::tuple adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d(const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size); +static inline std::tuple adaptive_max_pool3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); +static inline Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false); +static inline Tensor & avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad); +static inline std::tuple 
fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline std::tuple fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples); +static inline Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices); +static inline Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices); +static inline std::tuple max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline std::tuple max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline std::tuple max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline std::tuple max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline std::tuple max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode); +static inline Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices); +static inline Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & 
max_unpool2d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size); +static inline Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding); +static inline Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d_forward(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d_forward(const Tensor & self, IntList padding); +static inline Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d(const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d(const Tensor & 
self, IntList padding); +static inline Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d(const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d_forward(const Tensor & self, IntList padding); +static inline Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding); +static inline Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_linear1d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_bilinear2d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_trilinear3d(const Tensor & self, IntList output_size); +static inline Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size); +static inline Tensor upsample_trilinear3d_forward(const Tensor & self, IntList output_size); +static inline Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size); +static inline Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & 
upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor); +static inline Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor); +static inline Tensor & _sigmoid_out(Tensor & output, const Tensor & self); +static inline Tensor _sigmoid(const Tensor & self); +static inline Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self); +static inline Tensor _sigmoid_forward(const Tensor & self); +static inline Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output); +static inline Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output); +static inline Tensor & _tanh_out(Tensor & output, const Tensor & self); +static inline Tensor _tanh(const Tensor & self); +static inline Tensor & _tanh_forward_out(Tensor & output, const Tensor & self); +static inline Tensor _tanh_forward(const Tensor & self); +static inline Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output); +static inline Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output); +static inline Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); +static inline std::tuple thnn_batch_norm_backward_out(Tensor & grad_input, Tensor 
& grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std); +static inline std::tuple thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation); +static inline std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, 
const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, std::array output_mask={{true, true}}); +static inline Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0); +static inline std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); +static inline std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding); 
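[Editor's note: a hedged usage sketch for the THNN convolution entry points declared above; the thnn_conv3d backward declarations continue right after this note. NCHW layout and the at::CPU(at::kFloat) factory are assumptions; kernel_size must match the spatial dimensions of the weight tensor.]

// Illustrative sketch only; not part of the committed header.
#include <ATen/ATen.h>

void conv_sketch() {
  at::Tensor input  = at::CPU(at::kFloat).rand({1, 3, 32, 32});  // N, C, H, W
  at::Tensor weight = at::CPU(at::kFloat).rand({16, 3, 3, 3});   // out_ch, in_ch, kH, kW
  at::Tensor bias   = at::CPU(at::kFloat).zeros({16});

  // Defaults from the declaration: stride=1, padding=0.
  at::Tensor out = at::thnn_conv2d(input, weight, /*kernel_size=*/{3, 3}, bias,
                                   /*stride=*/{1, 1}, /*padding=*/{0, 0});
}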
+static inline std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input); +static inline std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1); +static inline std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation); +static inline std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones); +static inline std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}); +static inline Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size); +static inline std::tuple 
adaptive_max_pool1d(const Tensor & self, IntList output_size); +static inline bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08); +static inline Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); +static inline Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); +static inline Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled); +static inline Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr); +static inline Tensor & bernoulli_(Tensor & self, double p=0.5, Generator * generator=nullptr); +static inline Tensor cat(TensorList tensors, int64_t dim=0); +static inline Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim=0); +static inline Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); +static inline std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0); +static inline bool cudnn_is_acceptable(const Tensor & self); +static inline Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups); +static inline Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled); +static inline Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding); +static inline std::tuple _convolution_double_backward(const Tensor & ggI, const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask); +static inline Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1); +static inline Tensor conv_tbc(const Tensor & self, const 
Tensor & weight, const Tensor & bias, int64_t pad); +static inline std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad); +static inline Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1); +static inline Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W); +static inline Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W); +static inline std::tuple cudnn_batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon); +static inline std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon); +static inline Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask); +static inline Tensor cudnn_convolution_backward_bias(const Tensor & grad_output); +static inline Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask); +static inline Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output); +static inline Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, 
IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic); +static inline Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid); +static inline std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output); +static inline Tensor det(const Tensor & self); +static inline std::tuple _det_with_svd(const Tensor & self); +static inline Tensor dot(const Tensor & self, const Tensor & tensor); +static inline Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +static inline Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse); +static inline Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +static inline Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type); +static inline Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +static inline Tensor empty_like(const Tensor & self); +static inline std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false); +static inline Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse); +static inline Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode); +static inline Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode); +static inline Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce); +static inline Tensor ger(const Tensor & self, const Tensor & vec2); +static inline Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2); +static inline Tensor index(const Tensor & self, TensorList indices); +static inline Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values); +static inline bool is_cuda(const Tensor & self); +static inline bool is_distributed(const Tensor & self); +static inline bool is_floating_point(const Tensor & self); +static inline bool is_nonzero(const Tensor & self); +static inline bool is_same_size(const Tensor & self, const Tensor & other); +static inline bool is_signed(const Tensor & self); +static inline bool is_sparse(const Tensor & self); +static inline Tensor matmul(const Tensor & self, const Tensor & other); +static inline std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false); +static inline Tensor mm(const Tensor & self, const Tensor & mat2); +static inline Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2); +static inline Tensor mv(const Tensor & self, const Tensor & vec); +static inline Tensor & mv_out(Tensor & result, const 
Tensor & self, const Tensor & vec); +static inline Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length); +static inline Tensor pin_memory(const Tensor & self); +static inline Tensor rand_like(const Tensor & self); +static inline Tensor randn_like(const Tensor & self); +static inline Tensor repeat(const Tensor & self, IntList repeats); +static inline std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale); +static inline Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes); +static inline Tensor rrelu(const Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor & rrelu_(Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr); +static inline Tensor select(const Tensor & self, int64_t dim, int64_t index); +static inline Tensor selu(const Tensor & self); +static inline Tensor & selu_(Tensor & self); +static inline int64_t size(const Tensor & self, int64_t dim); +static inline Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1); +static inline std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0); +static inline Tensor squeeze(const Tensor & self); +static inline Tensor squeeze(const Tensor & self, int64_t dim); +static inline Tensor & squeeze_(Tensor & self); +static inline Tensor & squeeze_(Tensor & self, int64_t dim); +static inline Tensor stack(TensorList tensors, int64_t dim=0); +static inline Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim=0); +static inline Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0); +static inline int64_t stride(const Tensor & self, int64_t dim); +static inline Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1); +static inline Tensor & t_(Tensor & self); +static inline Tensor type_as(const Tensor & self, const Tensor & other); +static inline Tensor unsqueeze(const Tensor & self, int64_t dim); +static inline Tensor & unsqueeze_(Tensor & self, int64_t dim); +static inline Tensor view_as(const Tensor & self, const Tensor & other); +static inline Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other); +static inline Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other); +static inline Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output); +static inline Tensor poisson(const Tensor & self, Generator * generator=nullptr); +static inline Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional); +static inline std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state); +static inline std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & 
weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array<bool,4> output_mask);
+
+static inline Type & infer_type(const Tensor & t) {
+  AT_ASSERT(t.defined(), "undefined Tensor");
+  return t.type();
+}
+static inline Type & infer_type(const TensorList & tl) {
+  AT_ASSERT(tl.size() > 0, "expected a non-empty list of Tensors");
+  return tl[0].type();
+}
+// function definitions are all static inline because
+// they are one-line statically dispatched functions that
+// invoke the actual dynamic dispatch on the correct argument
+static inline Tensor & zeros_out(Tensor & result, IntList size) {
+  return infer_type(result).zeros_out(result, size);
+}
+static inline Tensor & zeros_like_out(Tensor & result, const Tensor & input) {
+  return infer_type(result).zeros_like_out(result, input);
+}
+static inline Tensor zeros_like(const Tensor & input) {
+  return infer_type(input).zeros_like(input);
+}
+static inline Tensor & ones_out(Tensor & result, IntList size) {
+  return infer_type(result).ones_out(result, size);
+}
+static inline Tensor & ones_like_out(Tensor & result, const Tensor & input) {
+  return infer_type(result).ones_like_out(result, input);
+}
+static inline Tensor ones_like(const Tensor & input) {
+  return infer_type(input).ones_like(input);
+}
+static inline int64_t numel(const Tensor & self) {
+  return infer_type(self).numel(self);
+}
+static inline Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) {
+  return infer_type(self).masked_select_out(result, self, mask);
+}
+static inline Tensor masked_select(const Tensor & self, const Tensor & mask) {
+  return infer_type(self).masked_select(self, mask);
+}
+static inline Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) {
+  return infer_type(self).transpose(self, dim0, dim1);
+}
+static inline Tensor t(const Tensor & self) {
+  return infer_type(self).t(self);
+}
+static inline Tensor & nonzero_out(Tensor & result, const Tensor & self) {
+  return infer_type(self).nonzero_out(result, self);
+}
+static inline Tensor nonzero(const Tensor & self) {
+  return infer_type(self).nonzero(self);
+}
+static inline Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) {
+  return infer_type(self).index_select_out(result, self, dim, index);
+}
+static inline Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) {
+  return infer_type(self).index_select(self, dim, index);
+}
+static inline Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index) {
+  return infer_type(self).take_out(result, self, index);
+}
+static inline Tensor take(const Tensor & self, const Tensor & index) {
+  return infer_type(self).take(self, index);
+}
+static inline Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step) {
+  return infer_type(result).range_out(result, start, end, step);
+}
+static inline Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step) {
+  return infer_type(result).arange_out(result, start, end, step);
+}
+static inline Tensor & arange_out(Tensor & result, Scalar end) {
+  return infer_type(result).arange_out(result, end);
+}
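The three-line comment above is the only explanation in this generated block, so here is a minimal, self-contained sketch of the pattern it describes. The names `MyType`, `MyTensor`, and `CpuType` are invented stand-ins for illustration only (they are not ATen classes); the sketch merely mirrors the shape of the generated wrappers, where a `static inline` free function recovers a `Type` through `infer_type` and immediately makes a virtual call on it.

```
// Illustrative stand-ins only: MyType / MyTensor / CpuType are invented here
// to mirror the generated static-inline -> virtual dispatch pattern.
#include <cassert>
#include <iostream>

struct MyType {                              // plays the role of at::Type
  virtual ~MyType() = default;
  virtual int add(int a, int b) const = 0;   // the dynamic-dispatch step
};

struct CpuType : MyType {                    // one concrete backend
  int add(int a, int b) const override { return a + b; }
};

struct MyTensor {                            // plays the role of at::Tensor
  const MyType * type_;
  bool defined() const { return type_ != nullptr; }
  const MyType & type() const { return *type_; }
};

// Analogue of infer_type(): pull the Type off the argument, asserting it is defined.
static inline const MyType & infer_type(const MyTensor & t) {
  assert(t.defined() && "undefined Tensor");
  return t.type();
}

// Analogue of the generated one-liners: a statically dispatched wrapper that
// forwards to the dynamically dispatched method on the correct argument.
static inline int add(const MyTensor & self, int a, int b) {
  return infer_type(self).add(a, b);
}

int main() {
  CpuType cpu;
  MyTensor t{&cpu};
  std::cout << add(t, 2, 3) << "\n";         // prints 5 via CpuType::add
}
```

Every wrapper in this header has that same two-step shape, so a call such as `at::mm(a, b)` or `at::transpose(a, 0, 1)` resolves statically to the inline function and then, at run time, to whatever implementation the tensor's `Type` (backend plus scalar type) provides.

+static inline Tensor & gather_out(Tensor & result, const Tensor & self,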
int64_t dim, const Tensor & index) { + return infer_type(self).gather_out(result, self, dim, index); +} +static inline Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) { + return infer_type(self).gather(self, dim, index); +} +static inline bool equal(const Tensor & self, const Tensor & other) { + return infer_type(self).equal(self, other); +} +static inline Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__and___out(result, self, other); +} +static inline Tensor __and__(const Tensor & self, Scalar other) { + return infer_type(self).__and__(self, other); +} +static inline Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__and___out(result, self, other); +} +static inline Tensor __and__(const Tensor & self, const Tensor & other) { + return infer_type(self).__and__(self, other); +} +static inline Tensor & __iand__(Tensor & self, Scalar other) { + return infer_type(self).__iand__(self, other); +} +static inline Tensor & __iand__(Tensor & self, const Tensor & other) { + return infer_type(self).__iand__(self, other); +} +static inline Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__or___out(result, self, other); +} +static inline Tensor __or__(const Tensor & self, Scalar other) { + return infer_type(self).__or__(self, other); +} +static inline Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__or___out(result, self, other); +} +static inline Tensor __or__(const Tensor & self, const Tensor & other) { + return infer_type(self).__or__(self, other); +} +static inline Tensor & __ior__(Tensor & self, Scalar other) { + return infer_type(self).__ior__(self, other); +} +static inline Tensor & __ior__(Tensor & self, const Tensor & other) { + return infer_type(self).__ior__(self, other); +} +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__xor___out(result, self, other); +} +static inline Tensor __xor__(const Tensor & self, Scalar other) { + return infer_type(self).__xor__(self, other); +} +static inline Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__xor___out(result, self, other); +} +static inline Tensor __xor__(const Tensor & self, const Tensor & other) { + return infer_type(self).__xor__(self, other); +} +static inline Tensor & __ixor__(Tensor & self, Scalar other) { + return infer_type(self).__ixor__(self, other); +} +static inline Tensor & __ixor__(Tensor & self, const Tensor & other) { + return infer_type(self).__ixor__(self, other); +} +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__lshift___out(result, self, other); +} +static inline Tensor __lshift__(const Tensor & self, Scalar other) { + return infer_type(self).__lshift__(self, other); +} +static inline Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__lshift___out(result, self, other); +} +static inline Tensor __lshift__(const Tensor & self, const Tensor & other) { + return infer_type(self).__lshift__(self, other); +} +static inline Tensor & __ilshift__(Tensor & self, Scalar other) { + return infer_type(self).__ilshift__(self, other); +} +static inline Tensor & __ilshift__(Tensor & self, const Tensor & other) { + return 
infer_type(self).__ilshift__(self, other); +} +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).__rshift___out(result, self, other); +} +static inline Tensor __rshift__(const Tensor & self, Scalar other) { + return infer_type(self).__rshift__(self, other); +} +static inline Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).__rshift___out(result, self, other); +} +static inline Tensor __rshift__(const Tensor & self, const Tensor & other) { + return infer_type(self).__rshift__(self, other); +} +static inline Tensor & __irshift__(Tensor & self, Scalar other) { + return infer_type(self).__irshift__(self, other); +} +static inline Tensor & __irshift__(Tensor & self, const Tensor & other) { + return infer_type(self).__irshift__(self, other); +} +static inline Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).lt_out(result, self, other); +} +static inline Tensor lt(const Tensor & self, Scalar other) { + return infer_type(self).lt(self, other); +} +static inline Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).lt_out(result, self, other); +} +static inline Tensor lt(const Tensor & self, const Tensor & other) { + return infer_type(self).lt(self, other); +} +static inline Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).gt_out(result, self, other); +} +static inline Tensor gt(const Tensor & self, Scalar other) { + return infer_type(self).gt(self, other); +} +static inline Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).gt_out(result, self, other); +} +static inline Tensor gt(const Tensor & self, const Tensor & other) { + return infer_type(self).gt(self, other); +} +static inline Tensor & le_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).le_out(result, self, other); +} +static inline Tensor le(const Tensor & self, Scalar other) { + return infer_type(self).le(self, other); +} +static inline Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).le_out(result, self, other); +} +static inline Tensor le(const Tensor & self, const Tensor & other) { + return infer_type(self).le(self, other); +} +static inline Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).ge_out(result, self, other); +} +static inline Tensor ge(const Tensor & self, Scalar other) { + return infer_type(self).ge(self, other); +} +static inline Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).ge_out(result, self, other); +} +static inline Tensor ge(const Tensor & self, const Tensor & other) { + return infer_type(self).ge(self, other); +} +static inline Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).eq_out(result, self, other); +} +static inline Tensor eq(const Tensor & self, Scalar other) { + return infer_type(self).eq(self, other); +} +static inline Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).eq_out(result, self, other); +} +static inline Tensor eq(const Tensor & self, const Tensor & other) { + return infer_type(self).eq(self, other); +} +static inline Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other) { 
+ return infer_type(self).ne_out(result, self, other); +} +static inline Tensor ne(const Tensor & self, Scalar other) { + return infer_type(self).ne(self, other); +} +static inline Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).ne_out(result, self, other); +} +static inline Tensor ne(const Tensor & self, const Tensor & other) { + return infer_type(self).ne(self, other); +} +static inline std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).min_out(min, min_indices, self, dim, keepdim); +} +static inline std::tuple min(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).min(self, dim, keepdim); +} +static inline Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).min_out(result, self, other); +} +static inline Tensor min(const Tensor & self, const Tensor & other) { + return infer_type(self).min(self, other); +} +static inline Tensor min(const Tensor & self) { + return infer_type(self).min(self); +} +static inline std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).max_out(max, max_indices, self, dim, keepdim); +} +static inline std::tuple max(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).max(self, dim, keepdim); +} +static inline Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).max_out(result, self, other); +} +static inline Tensor max(const Tensor & self, const Tensor & other) { + return infer_type(self).max(self, other); +} +static inline Tensor max(const Tensor & self) { + return infer_type(self).max(self); +} +static inline std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool keepdim) { + return infer_type(self).kthvalue_out(values, indices, self, k, dim, keepdim); +} +static inline std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) { + return infer_type(self).kthvalue(self, k, dim, keepdim); +} +static inline std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mode_out(values, indices, self, dim, keepdim); +} +static inline std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mode(self, dim, keepdim); +} +static inline std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).median_out(values, indices, self, dim, keepdim); +} +static inline std::tuple median(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).median(self, dim, keepdim); +} +static inline Tensor median(const Tensor & self) { + return infer_type(self).median(self); +} +static inline std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + return infer_type(self).sort_out(values, indices, self, dim, descending); +} +static inline std::tuple sort(const Tensor & self, int64_t dim, bool descending) { + return infer_type(self).sort(self, dim, descending); +} +static inline std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + return infer_type(self).topk_out(values, indices, self, k, dim, largest, sorted); +} +static inline 
std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + return infer_type(self).topk(self, k, dim, largest, sorted); +} +static inline Tensor & abs_out(Tensor & result, const Tensor & self) { + return infer_type(self).abs_out(result, self); +} +static inline Tensor abs(const Tensor & self) { + return infer_type(self).abs(self); +} +static inline Tensor & sigmoid_out(Tensor & result, const Tensor & self) { + return infer_type(self).sigmoid_out(result, self); +} +static inline Tensor sigmoid(const Tensor & self) { + return infer_type(self).sigmoid(self); +} +static inline Tensor & log_out(Tensor & result, const Tensor & self) { + return infer_type(self).log_out(result, self); +} +static inline Tensor log(const Tensor & self) { + return infer_type(self).log(self); +} +static inline Tensor & log1p_out(Tensor & result, const Tensor & self) { + return infer_type(self).log1p_out(result, self); +} +static inline Tensor log1p(const Tensor & self) { + return infer_type(self).log1p(self); +} +static inline Tensor & lgamma_out(Tensor & result, const Tensor & self) { + return infer_type(self).lgamma_out(result, self); +} +static inline Tensor lgamma(const Tensor & self) { + return infer_type(self).lgamma(self); +} +static inline Tensor & digamma_out(Tensor & result, const Tensor & self) { + return infer_type(self).digamma_out(result, self); +} +static inline Tensor digamma(const Tensor & self) { + return infer_type(self).digamma(self); +} +static inline Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & self) { + return infer_type(self).polygamma_out(result, n, self); +} +static inline Tensor polygamma(int64_t n, const Tensor & self) { + return infer_type(self).polygamma(n, self); +} +static inline Tensor & exp_out(Tensor & result, const Tensor & self) { + return infer_type(self).exp_out(result, self); +} +static inline Tensor exp(const Tensor & self) { + return infer_type(self).exp(self); +} +static inline Tensor & expm1_out(Tensor & result, const Tensor & self) { + return infer_type(self).expm1_out(result, self); +} +static inline Tensor expm1(const Tensor & self) { + return infer_type(self).expm1(self); +} +static inline Tensor & cos_out(Tensor & result, const Tensor & self) { + return infer_type(self).cos_out(result, self); +} +static inline Tensor cos(const Tensor & self) { + return infer_type(self).cos(self); +} +static inline Tensor & acos_out(Tensor & result, const Tensor & self) { + return infer_type(self).acos_out(result, self); +} +static inline Tensor acos(const Tensor & self) { + return infer_type(self).acos(self); +} +static inline Tensor & cosh_out(Tensor & result, const Tensor & self) { + return infer_type(self).cosh_out(result, self); +} +static inline Tensor cosh(const Tensor & self) { + return infer_type(self).cosh(self); +} +static inline Tensor & sin_out(Tensor & result, const Tensor & self) { + return infer_type(self).sin_out(result, self); +} +static inline Tensor sin(const Tensor & self) { + return infer_type(self).sin(self); +} +static inline Tensor & asin_out(Tensor & result, const Tensor & self) { + return infer_type(self).asin_out(result, self); +} +static inline Tensor asin(const Tensor & self) { + return infer_type(self).asin(self); +} +static inline Tensor & sinh_out(Tensor & result, const Tensor & self) { + return infer_type(self).sinh_out(result, self); +} +static inline Tensor sinh(const Tensor & self) { + return infer_type(self).sinh(self); +} +static inline Tensor & tan_out(Tensor & result, const Tensor & self) { + 
return infer_type(self).tan_out(result, self); +} +static inline Tensor tan(const Tensor & self) { + return infer_type(self).tan(self); +} +static inline Tensor & atan_out(Tensor & result, const Tensor & self) { + return infer_type(self).atan_out(result, self); +} +static inline Tensor atan(const Tensor & self) { + return infer_type(self).atan(self); +} +static inline Tensor & tanh_out(Tensor & result, const Tensor & self) { + return infer_type(self).tanh_out(result, self); +} +static inline Tensor tanh(const Tensor & self) { + return infer_type(self).tanh(self); +} +static inline Tensor & erf_out(Tensor & result, const Tensor & self) { + return infer_type(self).erf_out(result, self); +} +static inline Tensor erf(const Tensor & self) { + return infer_type(self).erf(self); +} +static inline Tensor & erfc_out(Tensor & result, const Tensor & self) { + return infer_type(self).erfc_out(result, self); +} +static inline Tensor erfc(const Tensor & self) { + return infer_type(self).erfc(self); +} +static inline Tensor & erfinv_out(Tensor & result, const Tensor & self) { + return infer_type(self).erfinv_out(result, self); +} +static inline Tensor erfinv(const Tensor & self) { + return infer_type(self).erfinv(self); +} +static inline Tensor & sqrt_out(Tensor & result, const Tensor & self) { + return infer_type(self).sqrt_out(result, self); +} +static inline Tensor sqrt(const Tensor & self) { + return infer_type(self).sqrt(self); +} +static inline Tensor & rsqrt_out(Tensor & result, const Tensor & self) { + return infer_type(self).rsqrt_out(result, self); +} +static inline Tensor rsqrt(const Tensor & self) { + return infer_type(self).rsqrt(self); +} +static inline Tensor & ceil_out(Tensor & result, const Tensor & self) { + return infer_type(self).ceil_out(result, self); +} +static inline Tensor ceil(const Tensor & self) { + return infer_type(self).ceil(self); +} +static inline Tensor & floor_out(Tensor & result, const Tensor & self) { + return infer_type(self).floor_out(result, self); +} +static inline Tensor floor(const Tensor & self) { + return infer_type(self).floor(self); +} +static inline Tensor & round_out(Tensor & result, const Tensor & self) { + return infer_type(self).round_out(result, self); +} +static inline Tensor round(const Tensor & self) { + return infer_type(self).round(self); +} +static inline Tensor & trunc_out(Tensor & result, const Tensor & self) { + return infer_type(self).trunc_out(result, self); +} +static inline Tensor trunc(const Tensor & self) { + return infer_type(self).trunc(self); +} +static inline Tensor & frac_out(Tensor & result, const Tensor & self) { + return infer_type(self).frac_out(result, self); +} +static inline Tensor frac(const Tensor & self) { + return infer_type(self).frac(self); +} +static inline Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mean_out(result, self, dim, keepdim); +} +static inline Tensor mean(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).mean(self, dim, keepdim); +} +static inline Tensor mean(const Tensor & self) { + return infer_type(self).mean(self); +} +static inline Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).var_out(result, self, dim, unbiased, keepdim); +} +static inline Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).var(self, dim, unbiased, keepdim); +} +static inline Tensor var(const Tensor & self, bool 
unbiased) { + return infer_type(self).var(self, unbiased); +} +static inline Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).std_out(result, self, dim, unbiased, keepdim); +} +static inline Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) { + return infer_type(self).std(self, dim, unbiased, keepdim); +} +static inline Tensor std(const Tensor & self, bool unbiased) { + return infer_type(self).std(self, unbiased); +} +static inline Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim) { + return infer_type(self).norm_out(result, self, p, dim, keepdim); +} +static inline Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) { + return infer_type(self).norm(self, p, dim, keepdim); +} +static inline Tensor norm(const Tensor & self, Scalar p) { + return infer_type(self).norm(self, p); +} +static inline Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + return infer_type(self).renorm_out(result, self, p, dim, maxnorm); +} +static inline Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + return infer_type(self).renorm(self, p, dim, maxnorm); +} +static inline Tensor dist(const Tensor & self, const Tensor & other, Scalar p) { + return infer_type(self).dist(self, other, p); +} +static inline Tensor & reciprocal_out(Tensor & result, const Tensor & self) { + return infer_type(self).reciprocal_out(result, self); +} +static inline Tensor reciprocal(const Tensor & self) { + return infer_type(self).reciprocal(self); +} +static inline Tensor & neg_out(Tensor & result, const Tensor & self) { + return infer_type(self).neg_out(result, self); +} +static inline Tensor neg(const Tensor & self) { + return infer_type(self).neg(self); +} +static inline Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).atan2_out(result, self, other); +} +static inline Tensor atan2(const Tensor & self, const Tensor & other) { + return infer_type(self).atan2(self, other); +} +static inline Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent) { + return infer_type(self).pow_out(result, self, exponent); +} +static inline Tensor pow(const Tensor & self, Scalar exponent) { + return infer_type(self).pow(self, exponent); +} +static inline Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) { + return infer_type(self).pow_out(result, self, exponent); +} +static inline Tensor pow(const Tensor & self, const Tensor & exponent) { + return infer_type(self).pow(self, exponent); +} +static inline Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self) { + return infer_type(self).pow_out(result, base, self); +} +static inline Tensor pow(Scalar base, const Tensor & self) { + return infer_type(self).pow(base, self); +} +static inline Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) { + return infer_type(self).lerp_out(result, self, end, weight); +} +static inline Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) { + return infer_type(self).lerp(self, end, weight); +} +static inline Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps) { + return infer_type(result).linspace_out(result, start, end, steps); +} +static inline Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps) { + return 
infer_type(result).logspace_out(result, start, end, steps); +} +static inline Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { + return infer_type(self).histc_out(result, self, bins, min, max); +} +static inline Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { + return infer_type(self).histc(self, bins, min, max); +} +static inline Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).sum_out(result, self, dim, keepdim); +} +static inline Tensor sum(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).sum(self, dim, keepdim); +} +static inline Tensor sum(const Tensor & self) { + return infer_type(self).sum(self); +} +static inline Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).prod_out(result, self, dim, keepdim); +} +static inline Tensor prod(const Tensor & self, int64_t dim, bool keepdim) { + return infer_type(self).prod(self, dim, keepdim); +} +static inline Tensor prod(const Tensor & self) { + return infer_type(self).prod(self); +} +static inline Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim) { + return infer_type(self).cumsum_out(result, self, dim); +} +static inline Tensor cumsum(const Tensor & self, int64_t dim) { + return infer_type(self).cumsum(self, dim); +} +static inline Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim) { + return infer_type(self).cumprod_out(result, self, dim); +} +static inline Tensor cumprod(const Tensor & self, int64_t dim) { + return infer_type(self).cumprod(self, dim); +} +static inline Tensor & sign_out(Tensor & result, const Tensor & self) { + return infer_type(self).sign_out(result, self); +} +static inline Tensor sign(const Tensor & self) { + return infer_type(self).sign(self); +} +static inline Tensor trace(const Tensor & self) { + return infer_type(self).trace(self); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha) { + return infer_type(self).add_out(result, self, other, alpha); +} +static inline Tensor add(const Tensor & self, SparseTensor other, Scalar alpha) { + return infer_type(self).add(self, other, alpha); +} +static inline Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).sub_out(result, self, other, alpha); +} +static inline Tensor sub(const Tensor & self, Scalar other, Scalar alpha) { + return infer_type(self).sub(self, other, alpha); +} +static inline Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).sub_out(result, self, other, alpha); +} +static inline Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) { + return infer_type(self).sub(self, other, alpha); +} +static inline Tensor & 
mul_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).mul_out(result, self, other); +} +static inline Tensor mul(const Tensor & self, Scalar other) { + return infer_type(self).mul(self, other); +} +static inline Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).mul_out(result, self, other); +} +static inline Tensor mul(const Tensor & self, const Tensor & other) { + return infer_type(self).mul(self, other); +} +static inline Tensor & div_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).div_out(result, self, other); +} +static inline Tensor div(const Tensor & self, Scalar other) { + return infer_type(self).div(self, other); +} +static inline Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).div_out(result, self, other); +} +static inline Tensor div(const Tensor & self, const Tensor & other) { + return infer_type(self).div(self, other); +} +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).fmod_out(result, self, other); +} +static inline Tensor fmod(const Tensor & self, Scalar other) { + return infer_type(self).fmod(self, other); +} +static inline Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).fmod_out(result, self, other); +} +static inline Tensor fmod(const Tensor & self, const Tensor & other) { + return infer_type(self).fmod(self, other); +} +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other) { + return infer_type(self).remainder_out(result, self, other); +} +static inline Tensor remainder(const Tensor & self, Scalar other) { + return infer_type(self).remainder(self, other); +} +static inline Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other) { + return infer_type(self).remainder_out(result, self, other); +} +static inline Tensor remainder(const Tensor & self, const Tensor & other) { + return infer_type(self).remainder(self, other); +} +static inline Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp_out(result, self, min, max); +} +static inline Tensor clamp(const Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp(self, min, max); +} +static inline Tensor & clamp_(Tensor & self, Scalar min, Scalar max) { + return infer_type(self).clamp_(self, min, max); +} +static inline Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min) { + return infer_type(self).clamp_min_out(result, self, min); +} +static inline Tensor clamp_min(const Tensor & self, Scalar min) { + return infer_type(self).clamp_min(self, min); +} +static inline Tensor & clamp_min_(Tensor & self, Scalar min) { + return infer_type(self).clamp_min_(self, min); +} +static inline Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max) { + return infer_type(self).clamp_max_out(result, self, max); +} +static inline Tensor clamp_max(const Tensor & self, Scalar max) { + return infer_type(self).clamp_max(self, max); +} +static inline Tensor & clamp_max_(Tensor & self, Scalar max) { + return infer_type(self).clamp_max_(self, max); +} +static inline Tensor _dot(const Tensor & self, const Tensor & tensor) { + return infer_type(self)._dot(self, tensor); +} +static inline Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return 
infer_type(self).tril_out(result, self, diagonal); +} +static inline Tensor tril(const Tensor & self, int64_t diagonal) { + return infer_type(self).tril(self, diagonal); +} +static inline Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return infer_type(self).triu_out(result, self, diagonal); +} +static inline Tensor triu(const Tensor & self, int64_t diagonal) { + return infer_type(self).triu(self, diagonal); +} +static inline Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { + return infer_type(self).cross_out(result, self, other, dim); +} +static inline Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) { + return infer_type(self).cross(self, other, dim); +} +static inline Tensor & eye_out(Tensor & result, int64_t n, int64_t m) { + return infer_type(result).eye_out(result, n, m); +} +static inline Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal) { + return infer_type(self).diag_out(result, self, diagonal); +} +static inline Tensor diag(const Tensor & self, int64_t diagonal) { + return infer_type(self).diag(self, diagonal); +} +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).addmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self)._addmv_out(result, self, mat, vec, beta, alpha); +} +static inline Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self)._addmv(self, mat, vec, beta, alpha); +} +static inline Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self)._addr_out(result, self, vec1, vec2, beta, alpha); +} +static inline Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self)._addr(self, vec1, vec2, beta, alpha); +} +static inline Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) { + return infer_type(self)._ger_out(result, self, vec2); +} +static inline Tensor _ger(const Tensor & self, const Tensor & vec2) { + return infer_type(self)._ger(self, vec2); +} +static inline Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec) { + return infer_type(self)._mv_out(result, self, vec); +} +static inline Tensor _mv(const Tensor & self, const Tensor & vec) { + return infer_type(self)._mv(self, vec); +} +static inline Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self)._mm_out(result, self, mat2); +} +static inline Tensor _mm(const Tensor & self, const Tensor & mat2) { + return 
infer_type(self)._mm(self, mat2); +} +static inline Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self).bmm_out(result, self, mat2); +} +static inline Tensor bmm(const Tensor & self, const Tensor & mat2) { + return infer_type(self).bmm(self, mat2); +} +static inline Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).addbmm_out(result, self, batch1, batch2, beta, alpha); +} +static inline Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).addbmm(self, batch1, batch2, beta, alpha); +} +static inline Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).baddbmm_out(result, self, batch1, batch2, beta, alpha); +} +static inline Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return infer_type(self).baddbmm(self, batch1, batch2, beta, alpha); +} +static inline Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcmul_out(result, self, tensor1, tensor2, value); +} +static inline Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcmul(self, tensor1, tensor2, value); +} +static inline Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcdiv_out(result, self, tensor1, tensor2, value); +} +static inline Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return infer_type(self).addcdiv(self, tensor1, tensor2, value); +} +static inline std::tuple gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A) { + return infer_type(self).gesv_out(solution, lu, self, A); +} +static inline std::tuple gesv(const Tensor & self, const Tensor & A) { + return infer_type(self).gesv(self, A); +} +static inline std::tuple gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { + return infer_type(self).gels_out(res1, res2, self, A); +} +static inline std::tuple gels(const Tensor & self, const Tensor & A) { + return infer_type(self).gels(self, A); +} +static inline std::tuple trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) { + return infer_type(self).trtrs_out(res1, res2, self, A, upper, transpose, unitriangular); +} +static inline std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) { + return infer_type(self).trtrs(self, A, upper, transpose, unitriangular); +} +static inline std::tuple symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors, bool upper) { + return infer_type(self).symeig_out(res1, res2, self, eigenvectors, upper); +} +static inline std::tuple symeig(const Tensor & self, bool eigenvectors, bool upper) { + return infer_type(self).symeig(self, eigenvectors, upper); +} +static inline std::tuple eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { + return infer_type(self).eig_out(res1, res2, self, eigenvectors); +} +static inline std::tuple eig(const 
Tensor & self, bool eigenvectors) { + return infer_type(self).eig(self, eigenvectors); +} +static inline std::tuple svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some) { + return infer_type(self).svd_out(res1, res2, res3, self, some); +} +static inline std::tuple svd(const Tensor & self, bool some) { + return infer_type(self).svd(self, some); +} +static inline Tensor & inverse_out(Tensor & output, const Tensor & self) { + return infer_type(self).inverse_out(output, self); +} +static inline Tensor inverse(const Tensor & self) { + return infer_type(self).inverse(self); +} +static inline Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper) { + return infer_type(self).potrf_out(output, self, upper); +} +static inline Tensor potrf(const Tensor & self, bool upper) { + return infer_type(self).potrf(self, upper); +} +static inline Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper) { + return infer_type(self).potrs_out(result, self, input2, upper); +} +static inline Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) { + return infer_type(self).potrs(self, input2, upper); +} +static inline Tensor & potri_out(Tensor & output, const Tensor & self, bool upper) { + return infer_type(self).potri_out(output, self, upper); +} +static inline Tensor potri(const Tensor & self, bool upper) { + return infer_type(self).potri(self, upper); +} +static inline std::tuple pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper, Scalar tol) { + return infer_type(self).pstrf_out(res1, res2, self, upper, tol); +} +static inline std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) { + return infer_type(self).pstrf(self, upper, tol); +} +static inline std::tuple qr_out(Tensor & res1, Tensor & res2, const Tensor & self) { + return infer_type(self).qr_out(res1, res2, self); +} +static inline std::tuple qr(const Tensor & self) { + return infer_type(self).qr(self); +} +static inline std::tuple geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + return infer_type(self).geqrf_out(res1, res2, self); +} +static inline std::tuple geqrf(const Tensor & self) { + return infer_type(self).geqrf(self); +} +static inline Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { + return infer_type(self).orgqr_out(result, self, input2); +} +static inline Tensor orgqr(const Tensor & self, const Tensor & input2) { + return infer_type(self).orgqr(self, input2); +} +static inline Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + return infer_type(self).ormqr_out(result, self, input2, input3, left, transpose); +} +static inline Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + return infer_type(self).ormqr(self, input2, input3, left, transpose); +} +static inline std::tuple btrifact_out(Tensor & result, Tensor & pivots, const Tensor & self, bool pivot) { + return infer_type(self).btrifact_out(result, pivots, self, pivot); +} +static inline std::tuple btrifact(const Tensor & self, bool pivot) { + return infer_type(self).btrifact(self, pivot); +} +static inline std::tuple btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot) { + return infer_type(self).btrifact_with_info_out(result, pivots, info, self, pivot); +} +static inline std::tuple btrifact_with_info(const Tensor & self, bool 
pivot) { + return infer_type(self).btrifact_with_info(self, pivot); +} +static inline Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) { + return infer_type(self).btrisolve_out(result, self, LU_data, LU_pivots); +} +static inline Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) { + return infer_type(self).btrisolve(self, LU_data, LU_pivots); +} +static inline Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator) { + return infer_type(result).randperm_out(result, n, generator); +} +static inline Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) { + return infer_type(self).multinomial_out(result, self, num_samples, replacement, generator); +} +static inline Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) { + return infer_type(self).multinomial(self, num_samples, replacement, generator); +} +static inline Tensor & normal_out(Tensor & output, const Tensor & mean, double std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(const Tensor & mean, double std, Generator * generator) { + return infer_type(mean).normal(mean, std, generator); +} +static inline Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(double mean, const Tensor & std, Generator * generator) { + return infer_type(std).normal(mean, std, generator); +} +static inline Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator) { + return infer_type(output).normal_out(output, mean, std, generator); +} +static inline Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator) { + return infer_type(mean).normal(mean, std, generator); +} +static inline Tensor & rand_out(Tensor & result, IntList size, Generator * generator) { + return infer_type(result).rand_out(result, size, generator); +} +static inline Tensor & randn_out(Tensor & result, IntList size, Generator * generator) { + return infer_type(result).randn_out(result, size, generator); +} +static inline Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator) { + return infer_type(self).bernoulli_out(output, self, generator); +} +static inline Tensor bernoulli(const Tensor & self, Generator * generator) { + return infer_type(self).bernoulli(self, generator); +} +static inline Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator) { + return infer_type(self)._standard_gamma_out(output, self, generator); +} +static inline Tensor _standard_gamma(const Tensor & self, Generator * generator) { + return infer_type(self)._standard_gamma(self, generator); +} +static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total) { + return infer_type(output)._dirichlet_grad_out(output, x, alpha, total); +} +static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total) { + return infer_type(x)._dirichlet_grad(x, alpha, total); +} +static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) { + return infer_type(values).sparse_coo_tensor(indices, values, size); +} +static inline 
Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) { + return infer_type(values).sparse_coo_tensor(indices, values); +} +static inline Tensor alias(const Tensor & self) { + return infer_type(self).alias(self); +} +static inline Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided_out(result, self, size, stride, storage_offset); +} +static inline Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided(self, size, stride, storage_offset); +} +static inline Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) { + return infer_type(self).as_strided_(self, size, stride, storage_offset); +} +static inline Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim) { + return infer_type(self)._cat_out(self, tensors, dim); +} +static inline Tensor _cat(TensorList tensors, int64_t dim) { + return infer_type(tensors)._cat(tensors, dim); +} +static inline Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_out(output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy(self, target, weight, size_average, reduce); +} +static inline Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_forward_out(output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_forward(self, target, weight, size_average, reduce); +} +static inline Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_backward_out(grad_input, grad_output, self, target, weight, size_average, reduce); +} +static inline Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) { + return infer_type(self).binary_cross_entropy_backward(grad_output, self, target, weight, size_average, reduce); +} +static inline Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_out(output, self, target, size_average, reduce); +} +static inline Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div(self, target, size_average, reduce); +} +static inline Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return 
infer_type(self).kl_div_forward(self, target, size_average, reduce); +} +static inline Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).kl_div_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss(self, target, size_average, reduce); +} +static inline Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).l1_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss(self, target, size_average, reduce); +} +static inline Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).mse_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_out(output, self, 
target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss(self, target, p, margin, weight, size_average); +} +static inline Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_forward_out(output, self, target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_forward(self, target, p, margin, weight, size_average); +} +static inline Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_backward_out(grad_input, self, target, p, margin, weight, size_average); +} +static inline Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) { + return infer_type(self).multi_margin_loss_backward(self, target, p, margin, weight, size_average); +} +static inline Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss(self, target, size_average, reduce); +} +static inline std::tuple<Tensor &,Tensor &> multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_forward_out(output, is_target, self, target, size_average, reduce); +} +static inline std::tuple<Tensor,Tensor> multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).multilabel_margin_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) { + return infer_type(self).multilabel_margin_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce, is_target); +} +static inline Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) { + return infer_type(self).multilabel_margin_loss_backward(grad_output, self, target, size_average, reduce, is_target); +} +static inline Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_out(output, self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor nll_loss(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return 
infer_type(self).nll_loss(self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor &,Tensor &> nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_forward_out(output, total_weight, self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor,Tensor> nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss_forward(self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss_backward_out(grad_input, grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss_backward(grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_out(output, self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d(self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor &,Tensor &> nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_forward_out(output, total_weight, self, target, weight, size_average, ignore_index, reduce); +} +static inline std::tuple<Tensor,Tensor> nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) { + return infer_type(self).nll_loss2d_forward(self, target, weight, size_average, ignore_index, reduce); +} +static inline Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss2d_backward_out(grad_input, grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) { + return infer_type(self).nll_loss2d_backward(grad_output, self, target, weight, size_average, ignore_index, reduce, total_weight); +} +static inline Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return 
infer_type(self).smooth_l1_loss_out(output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss(self, target, size_average, reduce); +} +static inline Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_forward_out(output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_forward(self, target, size_average, reduce); +} +static inline Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_backward_out(grad_input, grad_output, self, target, size_average, reduce); +} +static inline Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) { + return infer_type(self).smooth_l1_loss_backward(grad_output, self, target, size_average, reduce); +} +static inline Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_out(output, self, target, size_average); +} +static inline Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss(self, target, size_average); +} +static inline Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_forward_out(output, self, target, size_average); +} +static inline Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_forward(self, target, size_average); +} +static inline Tensor & soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_backward_out(grad_input, self, target, size_average); +} +static inline Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average) { + return infer_type(self).soft_margin_loss_backward(self, target, size_average); +} +static inline Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_out(output, self, alpha, scale); +} +static inline Tensor elu(const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu(self, alpha, scale); +} +static inline Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward_out(output, self, alpha, scale); +} +static inline Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward(self, alpha, scale); +} +static inline Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) { + return infer_type(grad_input).elu_backward_out(grad_input, grad_output, alpha, scale, output); +} +static inline Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) { + return 
infer_type(grad_output).elu_backward(grad_output, alpha, scale, output); +} +static inline Tensor & elu_(Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_(self, alpha, scale); +} +static inline Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale) { + return infer_type(self).elu_forward_(self, alpha, scale); +} +static inline Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_out(output, self, dim); +} +static inline Tensor glu(const Tensor & self, int64_t dim) { + return infer_type(self).glu(self, dim); +} +static inline Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_forward_out(output, self, dim); +} +static inline Tensor glu_forward(const Tensor & self, int64_t dim) { + return infer_type(self).glu_forward(self, dim); +} +static inline Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_backward_out(grad_input, grad_output, self, dim); +} +static inline Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) { + return infer_type(self).glu_backward(grad_output, self, dim); +} +static inline Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_out(output, self, min_val, max_val); +} +static inline Tensor hardtanh(const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh(self, min_val, max_val); +} +static inline Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward_out(output, self, min_val, max_val); +} +static inline Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward(self, min_val, max_val); +} +static inline Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_backward_out(grad_input, grad_output, self, min_val, max_val); +} +static inline Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_backward(grad_output, self, min_val, max_val); +} +static inline Tensor & hardtanh_(Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_(self, min_val, max_val); +} +static inline Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val) { + return infer_type(self).hardtanh_forward_(self, min_val, max_val); +} +static inline Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_out(output, self, negative_slope); +} +static inline Tensor leaky_relu(const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu(self, negative_slope); +} +static inline Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward_out(output, self, negative_slope); +} +static inline Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward(self, negative_slope); +} +static inline Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope) { + return 
infer_type(self).leaky_relu_backward_out(grad_input, grad_output, self, negative_slope); +} +static inline Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_backward(grad_output, self, negative_slope); +} +static inline Tensor & leaky_relu_(Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_(self, negative_slope); +} +static inline Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope) { + return infer_type(self).leaky_relu_forward_(self, negative_slope); +} +static inline Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) { + return infer_type(self).log_sigmoid_out(output, self); +} +static inline Tensor log_sigmoid(const Tensor & self) { + return infer_type(self).log_sigmoid(self); +} +static inline std::tuple<Tensor &,Tensor &> log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) { + return infer_type(self).log_sigmoid_forward_out(output, buffer, self); +} +static inline std::tuple<Tensor,Tensor> log_sigmoid_forward(const Tensor & self) { + return infer_type(self).log_sigmoid_forward(self); +} +static inline Tensor & log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + return infer_type(self).log_sigmoid_backward_out(grad_input, grad_output, self, buffer); +} +static inline Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + return infer_type(self).log_sigmoid_backward(grad_output, self, buffer); +} +static inline Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_out(output, self, dim); +} +static inline Tensor log_softmax(const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax(self, dim); +} +static inline Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_forward_out(output, self, dim); +} +static inline Tensor log_softmax_forward(const Tensor & self, int64_t dim) { + return infer_type(self).log_softmax_forward(self, dim); +} +static inline Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).log_softmax_backward_out(grad_input, grad_output, self, dim, output); +} +static inline Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).log_softmax_backward(grad_output, self, dim, output); +} +static inline Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_out(output, self, weight); +} +static inline Tensor prelu(const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu(self, weight); +} +static inline Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_forward_out(output, self, weight); +} +static inline Tensor prelu_forward(const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_forward(self, weight); +} +static inline std::tuple<Tensor &,Tensor &> prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight) { + return infer_type(self).prelu_backward_out(grad_input, grad_weight, grad_output, self, weight); +} +static inline std::tuple<Tensor,Tensor> prelu_backward(const Tensor & 
grad_output, const Tensor & self, const Tensor & weight, std::array<bool,2> output_mask) { + return infer_type(self).prelu_backward(grad_output, self, weight, output_mask); +} +static inline Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_out(output, self, noise, lower, upper, training, generator); +} +static inline Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward_out(output, self, noise, lower, upper, training, generator); +} +static inline Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + return infer_type(self).rrelu_with_noise_backward_out(grad_input, grad_output, self, noise, lower, upper, training); +} +static inline Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + return infer_type(self).rrelu_with_noise_backward(grad_output, self, noise, lower, upper, training); +} +static inline Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_(self, noise, lower, upper, training, generator); +} +static inline Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_with_noise_forward_(self, noise, lower, upper, training, generator); +} +static inline Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).softmax_out(output, self, dim); +} +static inline Tensor softmax(const Tensor & self, int64_t dim) { + return infer_type(self).softmax(self, dim); +} +static inline Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + return infer_type(self).softmax_forward_out(output, self, dim); +} +static inline Tensor softmax_forward(const Tensor & self, int64_t dim) { + return infer_type(self).softmax_forward(self, dim); +} +static inline Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).softmax_backward_out(grad_input, grad_output, self, dim, output); +} +static inline Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) { + return infer_type(self).softmax_backward(grad_output, self, dim, output); +} +static inline Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_out(output, self, beta, threshold); +} +static inline Tensor 
softplus(const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus(self, beta, threshold); +} +static inline Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_forward_out(output, self, beta, threshold); +} +static inline Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold) { + return infer_type(self).softplus_forward(self, beta, threshold); +} +static inline Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) { + return infer_type(self).softplus_backward_out(grad_input, grad_output, self, beta, threshold, output); +} +static inline Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) { + return infer_type(self).softplus_backward(grad_output, self, beta, threshold, output); +} +static inline Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_out(output, self, lambd); +} +static inline Tensor softshrink(const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink(self, lambd); +} +static inline Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_forward_out(output, self, lambd); +} +static inline Tensor softshrink_forward(const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_forward(self, lambd); +} +static inline Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_backward_out(grad_input, grad_output, self, lambd); +} +static inline Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd) { + return infer_type(self).softshrink_backward(grad_output, self, lambd); +} +static inline Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_out(output, self, threshold, value); +} +static inline Tensor threshold(const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold(self, threshold, value); +} +static inline Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward_out(output, self, threshold, value); +} +static inline Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward(self, threshold, value); +} +static inline Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_backward_out(grad_input, grad_output, self, threshold, value); +} +static inline Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_backward(grad_output, self, threshold, value); +} +static inline Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_(self, threshold, value); +} +static inline Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value) { + return infer_type(self).threshold_forward_(self, threshold, value); +} +static inline Tensor & adaptive_avg_pool2d_out(Tensor & 
output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d(self, output_size); +} +static inline Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_forward_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool2d_forward(self, output_size); +} +static inline Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool2d_backward_out(grad_input, grad_output, self); +} +static inline Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool2d_backward(grad_output, self); +} +static inline Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d(self, output_size); +} +static inline Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_forward_out(output, self, output_size); +} +static inline Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool3d_forward(self, output_size); +} +static inline Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool3d_backward_out(grad_input, grad_output, self); +} +static inline Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) { + return infer_type(self).adaptive_avg_pool3d_backward(grad_output, self); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool2d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d(self, output_size); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_forward_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool2d_forward(self, output_size); +} +static inline Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool2d_backward_out(grad_input, grad_output, self, indices); +} +static inline Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool2d_backward(grad_output, self, indices); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & 
self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool3d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d(self, output_size); +} +static inline std::tuple<Tensor &,Tensor &> adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_forward_out(output, indices, self, output_size); +} +static inline std::tuple<Tensor,Tensor> adaptive_max_pool3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool3d_forward(self, output_size); +} +static inline Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool3d_backward_out(grad_input, grad_output, self, indices); +} +static inline Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) { + return infer_type(self).adaptive_max_pool3d_backward(grad_output, self, indices); +} +static inline Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_forward(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool2d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & 
avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_forward(self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) { + return infer_type(self).avg_pool3d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); +} +static inline std::tuple<Tensor &,Tensor &> fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_out(output, indices, self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor,Tensor> fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d(self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor &,Tensor &> fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_forward_out(output, indices, self, kernel_size, output_size, random_samples); +} +static inline std::tuple<Tensor,Tensor> fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) { + return infer_type(self).fractional_max_pool2d_forward(self, kernel_size, output_size, random_samples); +} +static inline Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) { + return infer_type(self).fractional_max_pool2d_backward_out(grad_input, grad_output, self, kernel_size, output_size, indices); +} +static inline Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) { + return infer_type(self).fractional_max_pool2d_backward(grad_output, self, kernel_size, output_size, indices); +} +static inline std::tuple<Tensor &,Tensor &> max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool2d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor &,Tensor &> 
max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool2d_forward(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool2d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool2d_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline std::tuple<Tensor &,Tensor &> max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool3d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor &,Tensor &> max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline std::tuple<Tensor,Tensor> max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool3d_forward(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool3d_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) { + return infer_type(self).max_pool3d_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); +} +static inline Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_out(output, self, indices, output_size); +} +static inline Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d(self, indices, output_size); +} +static inline Tensor & max_unpool2d_forward_out(Tensor 
& output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_forward_out(output, self, indices, output_size); +} +static inline Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_forward(self, indices, output_size); +} +static inline Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_backward_out(grad_input, grad_output, self, indices, output_size); +} +static inline Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) { + return infer_type(self).max_unpool2d_backward(grad_output, self, indices, output_size); +} +static inline Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_out(output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d(self, indices, output_size, stride, padding); +} +static inline Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_forward_out(output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_forward(self, indices, output_size, stride, padding); +} +static inline Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_backward_out(grad_input, grad_output, self, indices, output_size, stride, padding); +} +static inline Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) { + return infer_type(self).max_unpool3d_backward(grad_output, self, indices, output_size, stride, padding); +} +static inline Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_out(output, self, padding); +} +static inline Tensor reflection_pad1d(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d(self, padding); +} +static inline Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_forward_out(output, self, padding); +} +static inline Tensor reflection_pad1d_forward(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_forward(self, padding); +} +static inline Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad1d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return 
infer_type(self).reflection_pad1d_backward(grad_output, self, padding); +} +static inline Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_out(output, self, padding); +} +static inline Tensor reflection_pad2d(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d(self, padding); +} +static inline Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_forward_out(output, self, padding); +} +static inline Tensor reflection_pad2d_forward(const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_forward(self, padding); +} +static inline Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).reflection_pad2d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_out(output, self, padding); +} +static inline Tensor replication_pad1d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d(self, padding); +} +static inline Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_forward_out(output, self, padding); +} +static inline Tensor replication_pad1d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_forward(self, padding); +} +static inline Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad1d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_out(output, self, padding); +} +static inline Tensor replication_pad2d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d(self, padding); +} +static inline Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_forward_out(output, self, padding); +} +static inline Tensor replication_pad2d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_forward(self, padding); +} +static inline Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad2d_backward(grad_output, self, padding); +} +static inline Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding) { + return 
infer_type(self).replication_pad3d_out(output, self, padding); +} +static inline Tensor replication_pad3d(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d(self, padding); +} +static inline Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_forward_out(output, self, padding); +} +static inline Tensor replication_pad3d_forward(const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_forward(self, padding); +} +static inline Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_backward_out(grad_input, grad_output, self, padding); +} +static inline Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) { + return infer_type(self).replication_pad3d_backward(grad_output, self, padding); +} +static inline Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_out(output, self, output_size); +} +static inline Tensor upsample_linear1d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d(self, output_size); +} +static inline Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_forward_out(output, self, output_size); +} +static inline Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_linear1d_forward(self, output_size); +} +static inline Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_linear1d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_linear1d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_out(output, self, output_size); +} +static inline Tensor upsample_bilinear2d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d(self, output_size); +} +static inline Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_forward_out(output, self, output_size); +} +static inline Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_bilinear2d_forward(self, output_size); +} +static inline Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_bilinear2d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_bilinear2d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size) { + return 
infer_type(self).upsample_trilinear3d_out(output, self, output_size); +} +static inline Tensor upsample_trilinear3d(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d(self, output_size); +} +static inline Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d_forward_out(output, self, output_size); +} +static inline Tensor upsample_trilinear3d_forward(const Tensor & self, IntList output_size) { + return infer_type(self).upsample_trilinear3d_forward(self, output_size); +} +static inline Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_input).upsample_trilinear3d_backward_out(grad_input, grad_output, output_size, input_size); +} +static inline Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) { + return infer_type(grad_output).upsample_trilinear3d_backward(grad_output, output_size, input_size); +} +static inline Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d(self, scale_factor); +} +static inline Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest1d_backward(grad_output, self, scale_factor); +} +static inline Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d(self, scale_factor); +} +static inline Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest2d_backward(grad_output, self, scale_factor); +} +static inline Tensor & upsample_nearest3d_out(Tensor & 
output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d(self, scale_factor); +} +static inline Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_forward_out(output, self, scale_factor); +} +static inline Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_forward(self, scale_factor); +} +static inline Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_backward_out(grad_input, grad_output, self, scale_factor); +} +static inline Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) { + return infer_type(self).upsample_nearest3d_backward(grad_output, self, scale_factor); +} +static inline Tensor & _sigmoid_out(Tensor & output, const Tensor & self) { + return infer_type(self)._sigmoid_out(output, self); +} +static inline Tensor _sigmoid(const Tensor & self) { + return infer_type(self)._sigmoid(self); +} +static inline Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self) { + return infer_type(self)._sigmoid_forward_out(output, self); +} +static inline Tensor _sigmoid_forward(const Tensor & self) { + return infer_type(self)._sigmoid_forward(self); +} +static inline Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_input)._sigmoid_backward_out(grad_input, grad_output, output); +} +static inline Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_output)._sigmoid_backward(grad_output, output); +} +static inline Tensor & _tanh_out(Tensor & output, const Tensor & self) { + return infer_type(self)._tanh_out(output, self); +} +static inline Tensor _tanh(const Tensor & self) { + return infer_type(self)._tanh(self); +} +static inline Tensor & _tanh_forward_out(Tensor & output, const Tensor & self) { + return infer_type(self)._tanh_forward_out(output, self); +} +static inline Tensor _tanh_forward(const Tensor & self) { + return infer_type(self)._tanh_forward(self); +} +static inline Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_input)._tanh_backward_out(grad_input, grad_output, output); +} +static inline Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output) { + return infer_type(grad_output)._tanh_backward(grad_output, output); +} +static inline Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_out(output, self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm(self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline 
std::tuple<Tensor &,Tensor &,Tensor &> thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_forward_out(output, save_mean, save_std, self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) { + return infer_type(self).thnn_batch_norm_forward(self, weight, bias, running_mean, running_var, training, momentum, eps); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_batch_norm_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std) { + return infer_type(self).thnn_batch_norm_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, running_mean, running_var, training, eps, save_mean, save_std); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array<bool,3> output_mask) { + return infer_type(self).thnn_batch_norm_backward(grad_output, self, weight, running_mean, running_var, training, eps, save_mean, save_std, output_mask); +} +static inline Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_out(output, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor,Tensor,Tensor> thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose2d_forward(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple<Tensor &,Tensor &,Tensor &> thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones) {
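+  // As in the rest of this file, the call is dispatched through the Type object
+  // returned by infer_type(self), which supplies the backend-specific implementation.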
+ return infer_type(self).thnn_conv_transpose2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_transpose2d_backward(grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, columns, ones, output_mask); +} +static inline Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_out(output, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) { + return infer_type(self).thnn_conv_transpose3d_forward(self, weight, kernel_size, bias, stride, padding, output_padding, dilation); +} +static inline std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv_transpose3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, finput, fgrad_input); +} +static inline std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv_transpose3d_backward(grad_output, self, weight, kernel_size, stride, padding, output_padding, dilation, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_out(output, self, weight, kernel_size, bias, stride, padding); +} +static inline Tensor thnn_conv2d(const Tensor & self, const Tensor 
& weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv2d_forward(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input); +} +static inline std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv2d_backward(grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_forward_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_depthwise2d_backward_out(grad_input, grad_weight, grad_output, self, weight, kernel_size, stride, padding, dilation); +} +static inline std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, 
std::array output_mask) { + return infer_type(self).thnn_conv_depthwise2d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, output_mask); +} +static inline Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_out(output, self, weight, kernel_size, bias, stride, padding); +} +static inline Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) { + return infer_type(self).thnn_conv3d_forward(self, weight, kernel_size, bias, stride, padding); +} +static inline std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) { + return infer_type(self).thnn_conv3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input); +} +static inline std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask) { + return infer_type(self).thnn_conv3d_backward(grad_output, self, weight, kernel_size, stride, padding, finput, fgrad_input, output_mask); +} +static inline Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated2d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple 
thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) { + return infer_type(self).thnn_conv_dilated2d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_dilated2d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones, output_mask); +} +static inline Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_out(output, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_forward_out(output, columns, ones, self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) { + return infer_type(self).thnn_conv_dilated3d_forward(self, weight, kernel_size, bias, stride, padding, dilation); +} +static inline std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) { + return infer_type(self).thnn_conv_dilated3d_backward_out(grad_input, grad_weight, grad_bias, grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones); +} +static inline std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask) { + return infer_type(self).thnn_conv_dilated3d_backward(grad_output, self, weight, kernel_size, stride, padding, dilation, columns, ones, output_mask); +} +static inline Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_avg_pool1d(self, output_size); +} +static inline std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) { + return infer_type(self).adaptive_max_pool1d(self, output_size); +} +static inline bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol) { + return 
infer_type(self).allclose(self, other, rtol, atol); +} +static inline Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv(self, mat, vec, beta, alpha); +} +static inline Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv_(self, mat, vec, beta, alpha); +} +static inline Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return infer_type(self).addmv_out(result, self, mat, vec, beta, alpha); +} +static inline Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr(self, vec1, vec2, beta, alpha); +} +static inline Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr_(self, vec1, vec2, beta, alpha); +} +static inline Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return infer_type(self).addr_out(result, self, vec1, vec2, beta, alpha); +} +static inline Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled) { + return infer_type(input).batch_norm(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled); +} +static inline Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) { + return infer_type(self).bernoulli_(self, p, generator); +} +static inline Tensor & bernoulli_(Tensor & self, double p, Generator * generator) { + return infer_type(self).bernoulli_(self, p, generator); +} +static inline Tensor cat(TensorList tensors, int64_t dim) { + return infer_type(tensors).cat(tensors, dim); +} +static inline Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim) { + return infer_type(result).cat_out(result, tensors, dim); +} +static inline Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).sspaddmm(self, mat1, mat2, beta, alpha); +} +static inline Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return infer_type(self).sspaddmm_out(result, self, mat1, mat2, beta, alpha); +} +static inline std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) { + return infer_type(self).chunk(self, chunks, dim); +} +static inline bool cudnn_is_acceptable(const Tensor & self) { + return infer_type(self).cudnn_is_acceptable(self); +} +static inline Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) { + return infer_type(input).convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups); +} +static inline Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return infer_type(input)._convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, 
deterministic, cudnn_enabled); +} +static inline Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) { + return infer_type(input)._convolution_nogroup(input, weight, bias, stride, padding, dilation, transposed, output_padding); +} +static inline std::tuple _convolution_double_backward(const Tensor & ggI, const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) { + return infer_type(self)._convolution_double_backward(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, output_mask); +} +static inline Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv1d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv2d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, int64_t groups) { + return infer_type(input).conv3d(input, weight, bias, stride, padding, dilation, groups); +} +static inline Tensor conv_tbc(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad) { + return infer_type(self).conv_tbc(self, weight, bias, pad); +} +static inline std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) { + return infer_type(self).conv_tbc_backward(self, input, weight, bias, pad); +} +static inline Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose1d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose2d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return infer_type(input).conv_transpose3d(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +static inline Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) { + return infer_type(theta).cudnn_affine_grid_generator(theta, N, C, H, W); +} +static inline Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) { + return infer_type(grad).cudnn_affine_grid_generator_backward(grad, N, C, H, W); +} +static inline std::tuple cudnn_batch_norm(const Tensor & input, const 
Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon) { + return infer_type(input).cudnn_batch_norm(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon); +} +static inline std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon) { + return infer_type(input).cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon); +} +static inline Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution(self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(grad_output).cudnn_convolution_backward_input(self_size, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { + return infer_type(self).cudnn_convolution_backward(self, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, output_mask); +} +static inline Tensor cudnn_convolution_backward_bias(const Tensor & grad_output) { + return infer_type(grad_output).cudnn_convolution_backward_bias(grad_output); +} +static inline Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_backward_weight(weight_size, grad_output, self, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_transpose(self, weight, bias, padding, output_padding, stride, dilation, groups, benchmark, deterministic); +} +static inline std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { + return infer_type(self).cudnn_convolution_transpose_backward(self, grad_output, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, output_mask); +} +static inline Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output) { + return infer_type(grad_output).cudnn_convolution_transpose_backward_bias(grad_output); +} +static inline Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const 
Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(grad_output).cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { + return infer_type(self).cudnn_convolution_transpose_backward_weight(weight_size, grad_output, self, padding, stride, dilation, groups, benchmark, deterministic); +} +static inline Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid) { + return infer_type(self).cudnn_grid_sampler(self, grid); +} +static inline std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output) { + return infer_type(self).cudnn_grid_sampler_backward(self, grid, grad_output); +} +static inline Tensor det(const Tensor & self) { + return infer_type(self).det(self); +} +static inline std::tuple _det_with_svd(const Tensor & self) { + return infer_type(self)._det_with_svd(self); +} +static inline Tensor dot(const Tensor & self, const Tensor & tensor) { + return infer_type(self).dot(self, tensor); +} +static inline Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + return infer_type(weight).embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse); +} +static inline Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + return infer_type(grad).embedding_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse); +} +static inline Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { + return infer_type(grad).embedding_dense_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq); +} +static inline Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type) { + return infer_type(self).embedding_renorm_(self, indices, max_norm, norm_type); +} +static inline Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { + return infer_type(grad).embedding_sparse_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq); +} +static inline Tensor empty_like(const Tensor & self) { + return infer_type(self).empty_like(self); +} +static inline std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse) { + return infer_type(weight).embedding_bag(weight, indices, offsets, scale_grad_by_freq, mode, sparse); +} +static inline Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) { + return infer_type(grad).embedding_bag_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, sparse); +} +static inline Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & 
offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) { + return infer_type(grad).embedding_bag_sparse_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode); +} +static inline Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) { + return infer_type(grad).embedding_bag_dense_backward(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode); +} +static inline Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce) { + return infer_type(self).hinge_embedding_loss(self, target, margin, size_average, reduce); +} +static inline Tensor ger(const Tensor & self, const Tensor & vec2) { + return infer_type(self).ger(self, vec2); +} +static inline Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) { + return infer_type(self).ger_out(result, self, vec2); +} +static inline Tensor index(const Tensor & self, TensorList indices) { + return infer_type(self).index(self, indices); +} +static inline Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) { + return infer_type(self).index_put_(self, indices, values); +} +static inline bool is_cuda(const Tensor & self) { + return infer_type(self).is_cuda(self); +} +static inline bool is_distributed(const Tensor & self) { + return infer_type(self).is_distributed(self); +} +static inline bool is_floating_point(const Tensor & self) { + return infer_type(self).is_floating_point(self); +} +static inline bool is_nonzero(const Tensor & self) { + return infer_type(self).is_nonzero(self); +} +static inline bool is_same_size(const Tensor & self, const Tensor & other) { + return infer_type(self).is_same_size(self, other); +} +static inline bool is_signed(const Tensor & self) { + return infer_type(self).is_signed(self); +} +static inline bool is_sparse(const Tensor & self) { + return infer_type(self).is_sparse(self); +} +static inline Tensor matmul(const Tensor & self, const Tensor & other) { + return infer_type(self).matmul(self, other); +} +static inline std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) { + return infer_type(self).max_pool1d(self, kernel_size, stride, padding, dilation, ceil_mode); +} +static inline Tensor mm(const Tensor & self, const Tensor & mat2) { + return infer_type(self).mm(self, mat2); +} +static inline Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) { + return infer_type(self).mm_out(result, self, mat2); +} +static inline Tensor mv(const Tensor & self, const Tensor & vec) { + return infer_type(self).mv(self, vec); +} +static inline Tensor & mv_out(Tensor & result, const Tensor & self, const Tensor & vec) { + return infer_type(self).mv_out(result, self, vec); +} +static inline Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) { + return infer_type(self).narrow(self, dim, start, length); +} +static inline Tensor pin_memory(const Tensor & self) { + return infer_type(self).pin_memory(self); +} +static inline Tensor rand_like(const Tensor & self) { + return infer_type(self).rand_like(self); +} +static inline Tensor randn_like(const Tensor & self) { + return infer_type(self).randn_like(self); +} +static inline Tensor repeat(const Tensor & 
self, IntList repeats) { + return infer_type(self).repeat(self, repeats); +} +static inline std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) { + return infer_type(input).RoiPooling2d_forward(input, rois, pooledHeight, pooledWidth, spatialScale); +} +static inline Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes) { + return infer_type(input).RoiPooling2d_backward(input, rois, pooledHeight, pooledWidth, spatialScale, gradOutput, argmaxes); +} +static inline Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu(self, lower, upper, training, generator); +} +static inline Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator * generator) { + return infer_type(self).rrelu_(self, lower, upper, training, generator); +} +static inline Tensor select(const Tensor & self, int64_t dim, int64_t index) { + return infer_type(self).select(self, dim, index); +} +static inline Tensor selu(const Tensor & self) { + return infer_type(self).selu(self); +} +static inline Tensor & selu_(Tensor & self) { + return infer_type(self).selu_(self); +} +static inline int64_t size(const Tensor & self, int64_t dim) { + return infer_type(self).size(self, dim); +} +static inline Tensor slice(const Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) { + return infer_type(self).slice(self, dim, start, end, step); +} +static inline std::vector split(const Tensor & self, int64_t split_size, int64_t dim) { + return infer_type(self).split(self, split_size, dim); +} +static inline Tensor squeeze(const Tensor & self) { + return infer_type(self).squeeze(self); +} +static inline Tensor squeeze(const Tensor & self, int64_t dim) { + return infer_type(self).squeeze(self, dim); +} +static inline Tensor & squeeze_(Tensor & self) { + return infer_type(self).squeeze_(self); +} +static inline Tensor & squeeze_(Tensor & self, int64_t dim) { + return infer_type(self).squeeze_(self, dim); +} +static inline Tensor stack(TensorList tensors, int64_t dim) { + return infer_type(tensors).stack(tensors, dim); +} +static inline Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim) { + return infer_type(result).stack_out(result, tensors, dim); +} +static inline Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided, const Tensor & window, int64_t pad_end) { + return infer_type(self).stft(self, frame_length, hop, fft_size, return_onesided, window, pad_end); +} +static inline int64_t stride(const Tensor & self, int64_t dim) { + return infer_type(self).stride(self, dim); +} +static inline Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + return infer_type(self).transpose_(self, dim0, dim1); +} +static inline Tensor & t_(Tensor & self) { + return infer_type(self).t_(self); +} +static inline Tensor type_as(const Tensor & self, const Tensor & other) { + return infer_type(self).type_as(self, other); +} +static inline Tensor unsqueeze(const Tensor & self, int64_t dim) { + return infer_type(self).unsqueeze(self, dim); +} +static inline Tensor & unsqueeze_(Tensor & self, int64_t dim) { + return infer_type(self).unsqueeze_(self, dim); +} +static inline Tensor view_as(const Tensor & self, const Tensor & other) { + return 
infer_type(self).view_as(self, other); +} +static inline Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) { + return infer_type(self).where(condition, self, other); +} +static inline Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other) { + return infer_type(self)._s_where(condition, self, other); +} +static inline Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output) { + return infer_type(self)._standard_gamma_grad(self, output); +} +static inline Tensor poisson(const Tensor & self, Generator * generator) { + return infer_type(self).poisson(self, generator); +} +static inline Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) { + return infer_type(weight_arr)._cudnn_rnn_flatten_weight(weight_arr, weight_stride0, input_size, mode, hidden_size, num_layers, batch_first, bidirectional); +} +static inline std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state) { + return infer_type(input)._cudnn_rnn(input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state); +} +static inline std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array output_mask) { + return infer_type(input)._cudnn_rnn_backward(input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask); +} + +} diff --git a/aten/doc/Tensor.h b/aten/doc/Tensor.h new file mode 100644 index 0000000..7cfc669 --- /dev/null +++ b/aten/doc/Tensor.h @@ -0,0 +1,464 @@ +#pragma once + +#include "ATen/Generator.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/TensorAccessor.h" +#include "ATen/TensorImpl.h" +#include "ATen/TensorBase.h" +#include "ATen/Storage.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Utils.h" + +namespace at { +struct Type; + +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. 
it is not associated with any underlying TensorImpl, and
+// special care must be taken to handle this.
+struct Tensor : public detail::TensorBase {
+  Tensor() : TensorBase() {}
+  Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {}
+  Tensor(const TensorBase & rhs) : TensorBase(rhs) {}
+  Tensor(const Tensor & rhs) = default;
+  Tensor(Tensor && rhs) noexcept = default;
+
+  // reimplemented from TensorBase so the return type is Tensor rather than TensorBase
+  Tensor & operator=(Tensor && rhs) & {
+    rhs.swap(*this);
+    return *this;
+  }
+  Tensor & operator=(Tensor const & rhs) & {
+    //Tensor ctor retains original rhs.pImpl
+    //then rhs.pImpl is swapped with this->pImpl
+    //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl
+    Tensor(rhs).swap(*this);
+    return *this;
+  }
+
+  inline Tensor & operator=(Tensor const & rhs) &&;
+  Tensor & operator=(Scalar v) &&;
+  const char * toString() const {
+    return pImpl->toString();
+  }
+  IntList sizes() const {
+    return pImpl->sizes();
+  }
+  IntList strides() const {
+    return pImpl->strides();
+  }
+  int64_t ndimension() const {
+    return dim();
+  }
+  Type & type() const {
+    return pImpl->type();
+  }
+  std::unique_ptr<Storage> storage() const {
+    return pImpl->storage();
+  }
+  inline Tensor toType(const Type & t) const;
+  inline Tensor & copy_(const Tensor & src, bool non_blocking=false);
+  inline Tensor toType(ScalarType t) const;
+  inline Tensor toBackend(Backend b) const;
+
+  template<typename T>
+  T * data() const;
+
+  void * unsafeGetTH(bool retain) const {
+    return pImpl->unsafeGetTH(retain);
+  }
+
+  // Purposely not defined here to avoid inlining
+  void print() const;
+
+  //toLongData(), toFloatData() etc.
+  #define TO_TYPE_DATA(T,name,_) \
+  T * to##name##Data() const;
+  AT_FORALL_SCALAR_TYPES(TO_TYPE_DATA)
+  #undef TO_TYPE_DATA
+
+  #define TO_C_TYPE(T,name,_) \
+  T toC##name () const;
+  AT_FORALL_SCALAR_TYPES(TO_C_TYPE)
+  #undef TO_C_TYPE
+
+  template<typename T, size_t N>
+  TensorAccessor<T,N> accessor() {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data<T>()");
+    AT_ASSERT(dim() == N, "expected %d dims but tensor has %d",N,dim());
+    return TensorAccessor<T,N>(data<T>(),sizes().data(),strides().data());
+  }
+
+  Tensor operator-() const;
+  Tensor& operator+=(const Tensor & other);
+  Tensor& operator+=(Scalar other);
+  Tensor& operator-=(const Tensor & other);
+  Tensor& operator-=(Scalar other);
+  Tensor& operator*=(const Tensor & other);
+  Tensor& operator*=(Scalar other);
+  Tensor& operator/=(const Tensor & other);
+  Tensor& operator/=(Scalar other);
+  Tensor operator[](int64_t idx) const;
+
+  // STOP. Thinking of adding a method here, which only makes use
+  // of other ATen methods? Define it in native_functions.yaml.
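+  // A minimal usage sketch for the data<T>() and accessor<T,N>() declarations
+  // above (illustrative only; assumes a 2-dimensional float Tensor named "t"
+  // already exists):
+  //   auto a = t.accessor<float,2>();      // asserts that t.dim() == 2
+  //   for (int64_t i = 0; i < t.size(0); i++)
+  //     a[i][0] = 0;                       // raw element access, no per-call dispatch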
+ + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + int64_t numel() const; + Tensor & set_(Storage & source); + Tensor & set_(Storage & source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor t() const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor clone() const; + Tensor view(IntList size) const; + Tensor & resize_as_(const Tensor & the_template); + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & 
ne_(const Tensor & other); + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min(const Tensor & other) const; + Tensor min() const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max(const Tensor & other) const; + Tensor max() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + Tensor all() const; + Tensor any() const; + int64_t get_device() const; + Tensor abs() const; + Tensor & abs_(); + Tensor & sigmoid_(); + Tensor sigmoid() const; + Tensor & log_(); + Tensor log() const; + Tensor & log1p_(); + Tensor log1p() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & exp_(); + Tensor exp() const; + Tensor & expm1_(); + Tensor expm1() const; + Tensor & cos_(); + Tensor cos() const; + Tensor & acos_(); + Tensor acos() const; + Tensor & cosh_(); + Tensor cosh() const; + Tensor & sin_(); + Tensor sin() const; + Tensor & asin_(); + Tensor asin() const; + Tensor & sinh_(); + Tensor sinh() const; + Tensor & tan_(); + Tensor tan() const; + Tensor & atan_(); + Tensor atan() const; + Tensor & tanh_(); + Tensor tanh() const; + Tensor & erf_(); + Tensor erf() const; + Tensor & erfc_(); + Tensor erfc() const; + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & sqrt_(); + Tensor sqrt() const; + Tensor & rsqrt_(); + Tensor rsqrt() const; + Tensor & ceil_(); + Tensor ceil() const; + Tensor & floor_(); + Tensor floor() const; + Tensor & round_(); + Tensor round() const; + Tensor & trunc_(); + Tensor trunc() const; + Tensor & frac_(); + Tensor frac() const; + Tensor mean(int64_t dim, bool keepdim=false) const; + Tensor mean() const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor var(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor std(bool unbiased=true) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor norm(Scalar p=2) const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(Scalar exponent) const; + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor & zero_(); + Tensor sum(int64_t dim, bool keepdim=false) const; + Tensor sum() const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod() const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor add(SparseTensor other, Scalar alpha=1) const; + Tensor & 
add_(Scalar other, Scalar alpha=1); + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor & add_(SparseTensor other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor mul(Scalar other) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(Scalar other); + Tensor & mul_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor div(const Tensor & other) const; + Tensor & div_(Scalar other); + Tensor & div_(const Tensor & other); + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor _dot(const Tensor & tensor) const; + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor addmm(SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & addmm_(SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor _addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor _addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor _ger(const Tensor & vec2) const; + Tensor _mv(const Tensor & vec) const; + Tensor _mm(const Tensor & mat2) const; + Tensor bmm(const Tensor & mat2) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gesv(const Tensor & A) const; + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor inverse() const; + Tensor potrf(bool 
upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor bernoulli(Generator * generator=nullptr) const; + Tensor _standard_gamma(Generator * generator=nullptr) const; + Tensor & _copy_ignoring_overlaps_(const Tensor & src); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset=-1) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset=-1); + Tensor & sparse_raw_resize_(IntList size, int64_t nDimI, int64_t nDimV); + Tensor & reshape_(IntList size, IntList stride); + Tensor _sparse_mask(SparseTensor mask) const; + Tensor to_dense() const; + int64_t _dimI() const; + int64_t _dimV() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08) const; + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p=0.5, Generator * generator=nullptr); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor conv_tbc(const Tensor & weight, const Tensor & bias, int64_t pad) const; + std::tuple conv_tbc_backward(const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) const; + Tensor det() const; + std::tuple _det_with_svd() const; + Tensor dot(const Tensor & tensor) const; + Tensor expand(IntList size) const; + Tensor expand_as(const Tensor & other) const; + Tensor ger(const Tensor & vec2) const; + Tensor index(TensorList indices) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + Tensor 
matmul(const Tensor & other) const; + Tensor mm(const Tensor & mat2) const; + Tensor mv(const Tensor & vec) const; + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor repeat(IntList repeats) const; + Tensor select(int64_t dim, int64_t index) const; + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::vector split(int64_t split_size, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor stft(int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0) const; + int64_t stride(int64_t dim) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor & t_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor _s_where(const Tensor & condition, const Tensor & other) const; + Tensor _standard_gamma_grad(const Tensor & output) const; +}; + +} //namespace at diff --git a/aten/doc/Type.h b/aten/doc/Type.h new file mode 100644 index 0000000..5d8ff4f --- /dev/null +++ b/aten/doc/Type.h @@ -0,0 +1,1134 @@ +#pragma once + +#include +#include +#include + +#include "ATen/ATenGeneral.h" +#include "ATen/ArrayRef.h" +#include "ATen/Generator.h" +#include "ATen/Half.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/ScalarType.h" +#include "ATen/Scalar.h" +#include "ATen/Tensor.h" +#include "ATen/Allocator.h" + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Storage; +struct Generator; +struct Allocator; + +// Note [Empty versus 0-dim tensors] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Unlike Torch, ATen treats zero-dimension tensors as having ONE +// element (that is to say, a zero-dimensional tensor is a scalar!) +// This is in contrast to Torch, where a zero-dimension tensor has +// zero elements. +// +// Because we are backed by Torch tensors, we need to be able to +// represent this state (of numel==0). These tensors are represented +// by one-dimensional tensors with size[0] == 0 and stride[0] == 1 +// (the stride is arbitrary but matches the NumPy equivalent). 
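+// A minimal sketch of the convention above (illustrative only; assumes a Type
+// reference named "type", e.g. a CPU float type obtained from the Context):
+//   Tensor scalar = type.scalarTensor(1);  // zero-dimensional: numel() == 1
+//   Tensor empty  = type.ones({0});        // numel() == 0; stored with sizes {0}, strides {1}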
+constexpr std::array<int64_t, 1> kEmptySizes { {0} };
+constexpr std::array<int64_t, 1> kEmptyStrides { {1} };
+
+static inline void noop_deleter(void*) {}
+
+enum class TypeID {
+ CPUByte,
+ CPUChar,
+ CPUDouble,
+ CPUFloat,
+ CPUInt,
+ CPULong,
+ CPUShort,
+ CPUHalf,
+ SparseCPUByte,
+ SparseCPUChar,
+ SparseCPUDouble,
+ SparseCPUFloat,
+ SparseCPUInt,
+ SparseCPULong,
+ SparseCPUShort,
+ CUDAByte,
+ CUDAChar,
+ CUDADouble,
+ CUDAFloat,
+ CUDAInt,
+ CUDALong,
+ CUDAShort,
+ CUDAHalf,
+ SparseCUDAByte,
+ SparseCUDAChar,
+ SparseCUDADouble,
+ SparseCUDAFloat,
+ SparseCUDAInt,
+ SparseCUDALong,
+ SparseCUDAShort,
+ Undefined,
+ NumOptions
+};
+
+
+struct AT_API Type {
+ explicit Type(Context * context)
+ : context(context) {}
+ virtual ~Type() {}
+ virtual ScalarType scalarType() const = 0;
+ virtual Backend backend() const = 0;
+ virtual bool is_cuda() const = 0;
+ virtual bool is_sparse() const = 0;
+ virtual bool is_distributed() const = 0;
+ static void registerAll(Context * context);
+ virtual std::unique_ptr<Storage> storage() const = 0;
+ virtual std::unique_ptr<Storage> storage(size_t size) const = 0;
+ virtual std::unique_ptr<Storage> storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+ virtual std::unique_ptr<Storage> storageWithAllocator(int64_t size, std::unique_ptr<Allocator> allocator) const = 0;
+ virtual std::unique_ptr<Generator> generator() const = 0;
+ virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0;
+ virtual std::unique_ptr<Storage> unsafeStorageFromTH(void * th_pointer, bool retain) const = 0;
+ virtual const char * toString() const = 0;
+ virtual size_t elementSizeInBytes() const = 0;
+ virtual Type & toBackend(Backend b) const;
+ virtual Type & toScalarType(ScalarType s) const;
+ Context& get_context() const { return *context; }
+
+ // contiguous IDs for all types in the system
+ // for external dispatch
+ virtual TypeID ID() const = 0;
+
+ Tensor copy(const Tensor & src, bool non_blocking=false) const;
+ Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const;
+ virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0;
+
+ Tensor tensorFromBlob(void * data, IntList sizes, const std::function<void(void*)> & deleter=noop_deleter) const;
+ Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function<void(void*)> & deleter=noop_deleter) const;
+ Tensor tensorWithAllocator(IntList sizes, std::unique_ptr<Allocator> allocator) const;
+ Tensor tensorWithAllocator(IntList sizes, IntList strides, std::unique_ptr<Allocator> allocator) const;
+ Tensor scalarTensor(Scalar s) const;
+
+ bool operator==(const Type& other) const;
+ bool operator!=(const Type& other) const;
+
+ // example
+ // virtual Tensor * add(Tensor & a, Tensor & b) = 0;
+ virtual int64_t storage_offset(const Tensor & self) const;
+ virtual Tensor & resize_(Tensor & self, IntList size) const;
+ virtual Tensor & zeros_out(Tensor & result, IntList size) const;
+ virtual Tensor zeros(IntList size) const;
+ virtual Tensor & zeros_like_out(Tensor & result, const Tensor & input) const;
+ virtual Tensor zeros_like(const Tensor & input) const;
+ virtual Tensor & ones_out(Tensor & result, IntList size) const;
+ virtual Tensor ones(IntList size) const;
+ virtual Tensor & ones_like_out(Tensor & result, const Tensor & input) const;
+ virtual Tensor ones_like(const Tensor & input) const;
+ virtual int64_t numel(const Tensor & self) const;
+ virtual Tensor & set_(Tensor & self, Storage & source) const;
+ virtual Tensor & set_(Tensor & self, Storage & source, int64_t storage_offset, IntList size, IntList 
stride={}) const; + virtual Tensor & set_(Tensor & self, const Tensor & source) const; + virtual Tensor & set_(Tensor & self) const; + virtual Tensor & fill_(Tensor & self, Scalar value) const; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const; + virtual bool is_contiguous(const Tensor & self) const; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const; + Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const; + Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const; + Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const; + Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) const; + virtual Tensor & s_masked_select_out(Tensor & result, const Tensor & self, const Tensor & mask) const; + Tensor masked_select(const Tensor & self, const Tensor & mask) const; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const; + virtual Tensor t(const Tensor & self) const; + virtual Tensor & nonzero_out(Tensor & result, const Tensor & self) const; + virtual Tensor nonzero(const Tensor & self) const; + virtual Tensor contiguous(const Tensor & self) const; + virtual Tensor clone(const Tensor & self) const; + virtual Tensor view(const Tensor & self, IntList size) const; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const; + virtual Tensor & index_select_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const; + virtual Tensor & take_out(Tensor & result, const Tensor & self, const Tensor & index) const; + virtual Tensor take(const Tensor & self, const Tensor & index) const; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const; + virtual Tensor & range_out(Tensor & result, Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor & arange_out(Tensor & result, Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor arange(Scalar start, Scalar end, Scalar step=1) const; + virtual Tensor & arange_out(Tensor & result, Scalar end) const; + virtual Tensor arange(Scalar end) const; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const 
Tensor & index, const Tensor & src) const; + virtual Tensor & gather_out(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) const; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const; + virtual void* data_ptr(const Tensor & self) const; + virtual bool equal(const Tensor & self, const Tensor & other) const; + virtual Tensor & __and___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __and__(const Tensor & self, Scalar other) const; + Tensor & __and___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___and___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __and__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __iand__(Tensor & self, Scalar other) const; + Tensor & __iand__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) const; + virtual Tensor & __or___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __or__(const Tensor & self, Scalar other) const; + Tensor & __or___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___or___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __or__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ior__(Tensor & self, Scalar other) const; + Tensor & __ior__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const; + virtual Tensor & __xor___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __xor__(const Tensor & self, Scalar other) const; + Tensor & __xor___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___xor___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __xor__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const; + Tensor & __ixor__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const; + virtual Tensor & __lshift___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const; + Tensor & __lshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___lshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __lshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const; + Tensor & __ilshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & __rshift___out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const; + Tensor & __rshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s___rshift___out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor __rshift__(const Tensor & self, 
const Tensor & other) const; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const; + Tensor & __irshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const; + virtual Tensor & lt_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor lt(const Tensor & self, Scalar other) const; + Tensor & lt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_lt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor lt(const Tensor & self, const Tensor & other) const; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const; + virtual Tensor & lt_(Tensor & self, Scalar other) const; + Tensor & lt_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const; + virtual Tensor & gt_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor gt(const Tensor & self, Scalar other) const; + Tensor & gt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_gt_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor gt(const Tensor & self, const Tensor & other) const; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const; + virtual Tensor & gt_(Tensor & self, Scalar other) const; + Tensor & gt_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const; + virtual Tensor & le_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor le(const Tensor & self, Scalar other) const; + Tensor & le_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_le_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor le(const Tensor & self, const Tensor & other) const; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const; + virtual Tensor & le_(Tensor & self, Scalar other) const; + Tensor & le_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const; + virtual Tensor & ge_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor ge(const Tensor & self, Scalar other) const; + Tensor & ge_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_ge_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor ge(const Tensor & self, const Tensor & other) const; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const; + virtual Tensor & ge_(Tensor & self, Scalar other) const; + Tensor & ge_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const; + virtual Tensor & eq_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor eq(const Tensor & self, Scalar other) const; + Tensor & eq_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_eq_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor eq(const Tensor & self, const Tensor & other) const; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const; + virtual Tensor & eq_(Tensor & self, Scalar other) const; + Tensor & eq_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) 
const; + virtual Tensor & ne_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor ne(const Tensor & self, Scalar other) const; + Tensor & ne_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_ne_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor ne(const Tensor & self, const Tensor & other) const; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const; + virtual Tensor & ne_(Tensor & self, Scalar other) const; + Tensor & ne_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const; + virtual std::tuple min_out(Tensor & min, Tensor & min_indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false) const; + Tensor & min_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_min_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor min(const Tensor & self, const Tensor & other) const; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const; + virtual Tensor min(const Tensor & self) const; + virtual std::tuple max_out(Tensor & max, Tensor & max_indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const; + Tensor & max_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_max_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor max(const Tensor & self, const Tensor & other) const; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const; + virtual Tensor max(const Tensor & self) const; + virtual std::tuple kthvalue_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const; + virtual std::tuple median_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor median(const Tensor & self) const; + virtual std::tuple sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim=-1, bool descending=false) const; + virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const; + virtual std::tuple topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + virtual Tensor all(const Tensor & self) const; + virtual Tensor any(const Tensor & self) const; + virtual int64_t get_device(const Tensor & self) const; + virtual Tensor & abs_out(Tensor & result, const Tensor & self) const; + virtual Tensor abs(const Tensor & self) const; + virtual Tensor & abs_(Tensor & self) const; + virtual Tensor & sigmoid_(Tensor & self) const; + virtual Tensor & sigmoid_out(Tensor & result, const Tensor & self) const; + virtual Tensor sigmoid(const Tensor & self) const; + 
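Many of the reductions declared above (min, max, kthvalue, mode, median, sort, topk) return a (values, indices) pair when given a dim argument, while the two-tensor overloads are elementwise. A minimal sketch, assuming only the `ATen/ATen.h` umbrella header and an already-obtained `at::Type` reference (neither comes from this patch):

```cpp
#include <tuple>
#include "ATen/ATen.h"  // assumed umbrella header

// Two flavours of max() as declared above: elementwise max against another
// tensor, and a reduction over a dimension returning (values, indices).
void max_flavours(at::Type & T) {
  at::Tensor a = T.rand({2, 3});          // rand(IntList, Generator* = nullptr)
  at::Tensor b = T.rand({2, 3});

  at::Tensor elementwise = T.max(a, b);   // same shape as a and b

  at::Tensor values, indices;
  std::tie(values, indices) = T.max(a, /*dim=*/1, /*keepdim=*/false);
  // values has size {2}; indices holds the argmax positions along dim 1.
}
```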
virtual Tensor & log_(Tensor & self) const; + virtual Tensor & log_out(Tensor & result, const Tensor & self) const; + virtual Tensor log(const Tensor & self) const; + virtual Tensor & log1p_(Tensor & self) const; + virtual Tensor & log1p_out(Tensor & result, const Tensor & self) const; + virtual Tensor log1p(const Tensor & self) const; + virtual Tensor & lgamma_out(Tensor & result, const Tensor & self) const; + virtual Tensor lgamma(const Tensor & self) const; + virtual Tensor & lgamma_(Tensor & self) const; + virtual Tensor & digamma_out(Tensor & result, const Tensor & self) const; + virtual Tensor digamma(const Tensor & self) const; + virtual Tensor & digamma_(Tensor & self) const; + virtual Tensor & polygamma_out(Tensor & result, int64_t n, const Tensor & self) const; + virtual Tensor polygamma(int64_t n, const Tensor & self) const; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const; + virtual Tensor & exp_(Tensor & self) const; + virtual Tensor & exp_out(Tensor & result, const Tensor & self) const; + virtual Tensor exp(const Tensor & self) const; + virtual Tensor & expm1_(Tensor & self) const; + virtual Tensor & expm1_out(Tensor & result, const Tensor & self) const; + virtual Tensor expm1(const Tensor & self) const; + virtual Tensor & cos_(Tensor & self) const; + virtual Tensor & cos_out(Tensor & result, const Tensor & self) const; + virtual Tensor cos(const Tensor & self) const; + virtual Tensor & acos_(Tensor & self) const; + virtual Tensor & acos_out(Tensor & result, const Tensor & self) const; + virtual Tensor acos(const Tensor & self) const; + virtual Tensor & cosh_(Tensor & self) const; + virtual Tensor & cosh_out(Tensor & result, const Tensor & self) const; + virtual Tensor cosh(const Tensor & self) const; + virtual Tensor & sin_(Tensor & self) const; + virtual Tensor & sin_out(Tensor & result, const Tensor & self) const; + virtual Tensor sin(const Tensor & self) const; + virtual Tensor & asin_(Tensor & self) const; + virtual Tensor & asin_out(Tensor & result, const Tensor & self) const; + virtual Tensor asin(const Tensor & self) const; + virtual Tensor & sinh_(Tensor & self) const; + virtual Tensor & sinh_out(Tensor & result, const Tensor & self) const; + virtual Tensor sinh(const Tensor & self) const; + virtual Tensor & tan_(Tensor & self) const; + virtual Tensor & tan_out(Tensor & result, const Tensor & self) const; + virtual Tensor tan(const Tensor & self) const; + virtual Tensor & atan_(Tensor & self) const; + virtual Tensor & atan_out(Tensor & result, const Tensor & self) const; + virtual Tensor atan(const Tensor & self) const; + virtual Tensor & tanh_(Tensor & self) const; + virtual Tensor & tanh_out(Tensor & result, const Tensor & self) const; + virtual Tensor tanh(const Tensor & self) const; + virtual Tensor & erf_(Tensor & self) const; + virtual Tensor & erf_out(Tensor & result, const Tensor & self) const; + virtual Tensor erf(const Tensor & self) const; + virtual Tensor & erfc_(Tensor & self) const; + virtual Tensor & erfc_out(Tensor & result, const Tensor & self) const; + virtual Tensor erfc(const Tensor & self) const; + virtual Tensor & erfinv_(Tensor & self) const; + virtual Tensor & erfinv_out(Tensor & result, const Tensor & self) const; + virtual Tensor erfinv(const Tensor & self) const; + virtual Tensor & sqrt_(Tensor & self) const; + virtual Tensor & sqrt_out(Tensor & result, const Tensor & self) const; + virtual Tensor sqrt(const Tensor & self) const; + virtual Tensor & rsqrt_(Tensor & self) const; + virtual Tensor & rsqrt_out(Tensor & result, const 
Tensor & self) const; + virtual Tensor rsqrt(const Tensor & self) const; + virtual Tensor & ceil_(Tensor & self) const; + virtual Tensor & ceil_out(Tensor & result, const Tensor & self) const; + virtual Tensor ceil(const Tensor & self) const; + virtual Tensor & floor_(Tensor & self) const; + virtual Tensor & floor_out(Tensor & result, const Tensor & self) const; + virtual Tensor floor(const Tensor & self) const; + virtual Tensor & round_(Tensor & self) const; + virtual Tensor & round_out(Tensor & result, const Tensor & self) const; + virtual Tensor round(const Tensor & self) const; + virtual Tensor & trunc_(Tensor & self) const; + virtual Tensor & trunc_out(Tensor & result, const Tensor & self) const; + virtual Tensor trunc(const Tensor & self) const; + virtual Tensor & frac_(Tensor & self) const; + virtual Tensor & frac_out(Tensor & result, const Tensor & self) const; + virtual Tensor frac(const Tensor & self) const; + virtual Tensor & mean_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor mean(const Tensor & self) const; + virtual Tensor & var_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor var(const Tensor & self, bool unbiased=true) const; + virtual Tensor & std_out(Tensor & result, const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const; + virtual Tensor std(const Tensor & self, bool unbiased=true) const; + virtual Tensor & norm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const; + virtual Tensor norm(const Tensor & self, Scalar p=2) const; + virtual Tensor & renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const; + virtual Tensor & reciprocal_out(Tensor & result, const Tensor & self) const; + virtual Tensor reciprocal(const Tensor & self) const; + virtual Tensor & reciprocal_(Tensor & self) const; + virtual Tensor & neg_out(Tensor & result, const Tensor & self) const; + virtual Tensor neg(const Tensor & self) const; + virtual Tensor & neg_(Tensor & self) const; + Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_atan2_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor atan2(const Tensor & self, const Tensor & other) const; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const; + Tensor & atan2_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const; + virtual Tensor & pow_out(Tensor & result, const Tensor & self, Scalar exponent) const; + virtual Tensor pow(const Tensor & self, Scalar exponent) const; + Tensor & pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) const; + 
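The statistics declared above (mean, var, std, norm) follow one pattern: a whole-tensor overload plus a per-dimension overload with an optional keepdim flag. A small sketch under the same assumptions as before (umbrella header, a `Type` reference obtained elsewhere):

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// Whole-tensor vs. per-dimension reductions, as declared above.
void reductions(at::Type & T) {
  at::Tensor x = T.rand({4, 5});

  at::Tensor total_mean = T.mean(x);                               // mean over all elements
  at::Tensor row_mean   = T.mean(x, /*dim=*/1);                    // size {4}
  at::Tensor row_mean_k = T.mean(x, /*dim=*/1, /*keepdim=*/true);  // size {4, 1}

  at::Tensor col_sd = T.std(x, /*dim=*/0, /*unbiased=*/true);      // size {5}
  at::Tensor row_l2 = T.norm(x, /*p=*/2, /*dim=*/1);               // per-row L2 norm, size {4}
}
```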
virtual Tensor & s_pow_out(Tensor & result, const Tensor & self, const Tensor & exponent) const; + Tensor pow(const Tensor & self, const Tensor & exponent) const; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const; + virtual Tensor & pow_out(Tensor & result, Scalar base, const Tensor & self) const; + virtual Tensor pow(Scalar base, const Tensor & self) const; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const; + Tensor & pow_(Tensor & self, const Tensor & exponent) const; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const; + Tensor & lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & s_lerp_out(Tensor & result, const Tensor & self, const Tensor & end, Scalar weight) const; + Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const; + Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const; + virtual Tensor & linspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor linspace(Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor & logspace_out(Tensor & result, Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const; + virtual Tensor & histc_out(Tensor & result, const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const; + virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const; + virtual Tensor & zero_(Tensor & self) const; + virtual Tensor & sum_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor sum(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor sum(const Tensor & self) const; + virtual Tensor & prod_out(Tensor & result, const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const; + virtual Tensor prod(const Tensor & self) const; + virtual Tensor & cumsum_out(Tensor & result, const Tensor & self, int64_t dim) const; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const; + virtual Tensor & cumprod_out(Tensor & result, const Tensor & self, int64_t dim) const; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const; + virtual Tensor & sign_out(Tensor & result, const Tensor & self) const; + virtual Tensor sign(const Tensor & self) const; + virtual Tensor & sign_(Tensor & self) const; + virtual Tensor trace(const Tensor & self) const; + virtual Tensor & add_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1) const; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_add_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor s_add(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & add_out(Tensor & result, const Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor add(const Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor & add_(Tensor & self, Scalar other, 
Scalar alpha=1) const; + Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_add_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & add_(Tensor & self, SparseTensor other, Scalar alpha=1) const; + virtual Tensor & sub_out(Tensor & result, const Tensor & self, Scalar other, Scalar alpha=1) const; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_sub_out(Tensor & result, const Tensor & self, const Tensor & other, Scalar alpha=1) const; + Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor s_sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const; + Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & s_sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const; + virtual Tensor & mul_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor mul(const Tensor & self, Scalar other) const; + Tensor & mul_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_mul_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor mul(const Tensor & self, const Tensor & other) const; + virtual Tensor s_mul(const Tensor & self, const Tensor & other) const; + virtual Tensor & mul_(Tensor & self, Scalar other) const; + Tensor & mul_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_mul_(Tensor & self, const Tensor & other) const; + virtual Tensor & div_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor div(const Tensor & self, Scalar other) const; + Tensor & div_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_div_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor div(const Tensor & self, const Tensor & other) const; + virtual Tensor s_div(const Tensor & self, const Tensor & other) const; + virtual Tensor & div_(Tensor & self, Scalar other) const; + Tensor & div_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_div_(Tensor & self, const Tensor & other) const; + virtual Tensor & fmod_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor fmod(const Tensor & self, Scalar other) const; + Tensor & fmod_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor fmod(const Tensor & self, const Tensor & other) const; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const; + virtual Tensor & fmod_(Tensor & self, Scalar other) const; + Tensor & fmod_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const; + virtual Tensor & remainder_out(Tensor & result, const Tensor & self, Scalar other) const; + virtual Tensor remainder(const Tensor & self, Scalar other) const; + Tensor & remainder_out(Tensor & result, const Tensor & self, const Tensor & other) const; + virtual Tensor & s_remainder_out(Tensor & result, const Tensor & self, const Tensor & other) const; + Tensor remainder(const Tensor & self, const Tensor & other) const; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const; + 
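The arithmetic entries above come in Scalar and Tensor flavours, each with an in-place `_` variant, and add/sub carry an `alpha` multiplier (result = self + alpha * other). A hedged sketch, again assuming the umbrella header and a `Type` reference from outside this patch:

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// Scalar vs. Tensor right-hand sides, alpha scaling, and in-place variants.
void arithmetic(at::Type & T) {
  at::Tensor a = T.ones({3});
  at::Tensor b = T.ones({3});

  at::Tensor c = T.add(a, b, /*alpha=*/2);   // 1 + 2*1 == 3 everywhere
  at::Tensor d = T.mul(c, /*other=*/0.5);    // scalar multiply -> 1.5
  T.div_(d, 3);                              // in-place divide  -> 0.5

  at::Tensor r = T.remainder(a, 2);          // elementwise remainder -> 1
}
```

The parallel `*_out` overloads in the declarations take a `result` tensor as their first argument and write into it rather than allocating a new tensor.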
virtual Tensor & remainder_(Tensor & self, Scalar other) const; + Tensor & remainder_(Tensor & self, const Tensor & other) const; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const; + virtual Tensor & clamp_out(Tensor & result, const Tensor & self, Scalar min, Scalar max) const; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const; + virtual Tensor & clamp_min_out(Tensor & result, const Tensor & self, Scalar min) const; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const; + virtual Tensor & clamp_max_out(Tensor & result, const Tensor & self, Scalar max) const; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const; + virtual Tensor _dot(const Tensor & self, const Tensor & tensor) const; + virtual Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const; + virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const; + virtual Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const; + virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const; + virtual Tensor & cross_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim=-1) const; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const; + virtual Tensor & eye_out(Tensor & result, int64_t n, int64_t m=-1) const; + virtual Tensor eye(int64_t n, int64_t m=-1) const; + virtual Tensor & diag_out(Tensor & result, const Tensor & self, int64_t diagonal=0) const; + virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const; + Tensor & addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_addmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_out(Tensor & result, const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor addmm(const Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmm_(Tensor & self, SparseTensor mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s__addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor _addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s__addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _addmv_(Tensor & self, const Tensor & mat, const 
Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & _addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s__addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor _addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s__addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & _ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) const; + virtual Tensor _ger(const Tensor & self, const Tensor & vec2) const; + virtual Tensor & _mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const; + virtual Tensor _mv(const Tensor & self, const Tensor & vec) const; + virtual Tensor & _mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor _mm(const Tensor & self, const Tensor & mat2) const; + virtual Tensor & bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const; + Tensor & addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_addbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & s_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor s_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcmul_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar 
value=1) const; + virtual Tensor & s_addcdiv_out(Tensor & result, const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + virtual std::tuple gesv_out(Tensor & solution, Tensor & lu, const Tensor & self, const Tensor & A) const; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const; + virtual std::tuple gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) const; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const; + virtual std::tuple trtrs_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + virtual std::tuple symeig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false, bool upper=true) const; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const; + virtual std::tuple eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors=false) const; + virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const; + virtual std::tuple svd_out(Tensor & res1, Tensor & res2, Tensor & res3, const Tensor & self, bool some=true) const; + virtual std::tuple svd(const Tensor & self, bool some=true) const; + virtual Tensor & inverse_out(Tensor & output, const Tensor & self) const; + virtual Tensor inverse(const Tensor & self) const; + virtual Tensor & potrf_out(Tensor & output, const Tensor & self, bool upper=true) const; + virtual Tensor potrf(const Tensor & self, bool upper=true) const; + virtual Tensor & potrs_out(Tensor & result, const Tensor & self, const Tensor & input2, bool upper=true) const; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const; + virtual Tensor & potri_out(Tensor & output, const Tensor & self, bool upper=true) const; + virtual Tensor potri(const Tensor & self, bool upper=true) const; + virtual std::tuple pstrf_out(Tensor & res1, Tensor & res2, const Tensor & self, bool upper=true, Scalar tol=-1) const; + virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const; + virtual std::tuple qr_out(Tensor & res1, Tensor & res2, const Tensor & self) const; + virtual std::tuple qr(const Tensor & self) const; + virtual std::tuple geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) const; + virtual std::tuple geqrf(const Tensor & self) const; + virtual Tensor & orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) const; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const; + virtual Tensor & ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + virtual std::tuple btrifact_out(Tensor & result, Tensor & 
pivots, const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact_with_info_out(Tensor & result, Tensor & pivots, Tensor & info, const Tensor & self, bool pivot=true) const; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const; + virtual Tensor & btrisolve_out(Tensor & result, const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const; + virtual Tensor & randperm_out(Tensor & result, int64_t n, Generator * generator=nullptr) const; + virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator=nullptr) const; + virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & multinomial_out(Tensor & result, const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, const Tensor & mean, double std=1, Generator * generator=nullptr) const; + virtual Tensor normal(const Tensor & mean, double std=1, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, double mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor normal(double mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor & normal_out(Tensor & output, const Tensor & mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor normal(const Tensor & mean, const Tensor & std, Generator * generator=nullptr) const; + virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const; + virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const; + virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const; + virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const; + virtual Tensor & rand_out(Tensor & result, IntList size, Generator * generator=nullptr) const; + virtual Tensor rand(IntList size, Generator * generator=nullptr) const; + virtual Tensor & randn_out(Tensor & result, IntList size, Generator * generator=nullptr) const; + virtual Tensor randn(IntList size, Generator * generator=nullptr) const; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const; + virtual Tensor & bernoulli_out(Tensor & output, const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor bernoulli(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & _standard_gamma_out(Tensor & output, const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor _standard_gamma(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total) const; + virtual Tensor _dirichlet_grad(const Tensor & x, const 
Tensor & alpha, const Tensor & total) const; + virtual Tensor tensor(Storage & storage, int64_t storageOffset, IntList size, IntList stride={}) const; + virtual Tensor tensor(IntList size) const; + virtual Tensor tensor(IntList size, IntList stride) const; + virtual Tensor tensor() const; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const; + virtual Tensor alias(const Tensor & self) const; + virtual Tensor & _copy_ignoring_overlaps_(Tensor & self, const Tensor & src) const; + virtual Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const; + virtual Tensor & sparse_raw_resize_(Tensor & self, IntList size, int64_t nDimI, int64_t nDimV) const; + virtual Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim=0) const; + virtual Tensor _cat(TensorList tensors, int64_t dim=0) const; + virtual Tensor & reshape_(Tensor & self, IntList size, IntList stride) const; + virtual Tensor _sparse_mask(const Tensor & self, SparseTensor mask) const; + virtual Tensor to_dense(const Tensor & self) const; + virtual int64_t _dimI(const Tensor & self) const; + virtual int64_t _dimV(const Tensor & self) const; + virtual int64_t _nnz(const Tensor & self) const; + virtual Tensor coalesce(const Tensor & self) const; + virtual bool is_coalesced(const Tensor & self) const; + virtual Tensor _indices(const Tensor & self) const; + virtual Tensor _values(const Tensor & self) const; + virtual Tensor & binary_cross_entropy_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true) const; + virtual Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, bool reduce=true) const; + virtual Tensor & binary_cross_entropy_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor binary_cross_entropy_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor & binary_cross_entropy_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, bool reduce) const; + virtual Tensor & kl_div_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor kl_div(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & kl_div_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor kl_div_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & kl_div_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + 
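The NN entry points declared above (binary_cross_entropy, kl_div, and the losses that follow) take the prediction and target first, then an optional weight and the reduction flags. A sketch of one call; the `at::CPU` accessor and `ATen/ATen.h` header are assumptions, while the binary_cross_entropy signature is taken from the declaration in this header:

```cpp
#include "ATen/ATen.h"  // assumed umbrella header

// One call into the generated NN API declared above.
void bce_example() {
  at::Type & T = at::CPU(at::kFloat);        // accessor assumed, not in this diff

  at::Tensor preds   = T.sigmoid(T.randn({4}));  // predictions in (0, 1)
  at::Tensor targets = T.ones({4});

  // Unweighted, averaged loss: weight defaults to an undefined Tensor.
  at::Tensor loss = T.binary_cross_entropy(preds, targets, /*weight=*/at::Tensor(),
                                           /*size_average=*/true, /*reduce=*/true);
}
```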
virtual Tensor kl_div_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & mse_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor mse_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & mse_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor mse_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & mse_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & multi_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true) const; + virtual Tensor multi_margin_loss(const Tensor & self, const Tensor & target, Scalar p=1, Scalar margin=1, const Tensor & weight={}, bool size_average=true) const; + virtual Tensor & multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor & multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor multi_margin_loss_backward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, bool size_average) const; + virtual Tensor & multilabel_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual std::tuple multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual std::tuple multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & 
grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) const; + virtual Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce, const Tensor & is_target) const; + virtual Tensor & nll_loss_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual Tensor nll_loss(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual std::tuple nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual std::tuple nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual Tensor & nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor & nll_loss2d_out(Tensor & output, const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & weight={}, bool size_average=true, int64_t ignore_index=-100, bool reduce=true) const; + virtual std::tuple nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual std::tuple nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce) const; + virtual Tensor & nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, bool size_average, int64_t ignore_index, bool reduce, const Tensor & total_weight) const; + virtual Tensor & smooth_l1_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, bool size_average=true, bool reduce=true) const; + virtual Tensor & smooth_l1_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor smooth_l1_loss_forward(const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & smooth_l1_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, 
const Tensor & target, bool size_average, bool reduce) const; + virtual Tensor & soft_margin_loss_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average=true) const; + virtual Tensor soft_margin_loss(const Tensor & self, const Tensor & target, bool size_average=true) const; + virtual Tensor & soft_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor soft_margin_loss_forward(const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor & soft_margin_loss_backward_out(Tensor & grad_input, const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor soft_margin_loss_backward(const Tensor & self, const Tensor & target, bool size_average) const; + virtual Tensor & elu_out(Tensor & output, const Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor elu(const Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor & elu_forward_out(Tensor & output, const Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor elu_forward(const Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor & elu_backward_out(Tensor & grad_input, const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) const; + virtual Tensor elu_backward(const Tensor & grad_output, Scalar alpha, Scalar scale, const Tensor & output) const; + virtual Tensor & elu_(Tensor & self, Scalar alpha=1, Scalar scale=1) const; + virtual Tensor & elu_forward_(Tensor & self, Scalar alpha, Scalar scale) const; + virtual Tensor & glu_out(Tensor & output, const Tensor & self, int64_t dim=-1) const; + virtual Tensor glu(const Tensor & self, int64_t dim=-1) const; + virtual Tensor & glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor glu_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) const; + virtual Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) const; + virtual Tensor & hardtanh_out(Tensor & output, const Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor hardtanh(const Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor & hardtanh_forward_out(Tensor & output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor hardtanh_forward(const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & hardtanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & hardtanh_(Tensor & self, Scalar min_val=-1, Scalar max_val=1) const; + virtual Tensor & hardtanh_forward_(Tensor & self, Scalar min_val, Scalar max_val) const; + virtual Tensor & leaky_relu_out(Tensor & output, const Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor leaky_relu(const Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor & leaky_relu_forward_out(Tensor & output, const Tensor & self, Scalar negative_slope) const; + virtual Tensor leaky_relu_forward(const Tensor & self, Scalar negative_slope) const; + virtual Tensor & leaky_relu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar negative_slope) const; + virtual 
Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, Scalar negative_slope) const; + virtual Tensor & leaky_relu_(Tensor & self, Scalar negative_slope=0.01) const; + virtual Tensor & leaky_relu_forward_(Tensor & self, Scalar negative_slope) const; + virtual Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) const; + virtual Tensor log_sigmoid(const Tensor & self) const; + virtual std::tuple log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) const; + virtual std::tuple log_sigmoid_forward(const Tensor & self) const; + virtual Tensor & log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) const; + virtual Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) const; + virtual Tensor & log_softmax_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const; + virtual Tensor & log_softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor log_softmax_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & log_softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor log_softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor & prelu_out(Tensor & output, const Tensor & self, const Tensor & weight) const; + virtual Tensor prelu(const Tensor & self, const Tensor & weight) const; + virtual Tensor & prelu_forward_out(Tensor & output, const Tensor & self, const Tensor & weight) const; + virtual Tensor prelu_forward(const Tensor & self, const Tensor & weight) const; + virtual std::tuple prelu_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight) const; + virtual std::tuple prelu_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, std::array output_mask={{true, true}}) const; + virtual Tensor & rrelu_with_noise_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) const; + virtual Tensor rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, Generator * generator) const; + virtual Tensor & rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) const; + virtual Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) const; + virtual Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, 
Generator * generator) const; + virtual Tensor & softmax_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor softmax(const Tensor & self, int64_t dim) const; + virtual Tensor & softmax_forward_out(Tensor & output, const Tensor & self, int64_t dim) const; + virtual Tensor softmax_forward(const Tensor & self, int64_t dim) const; + virtual Tensor & softmax_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor softmax_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, const Tensor & output) const; + virtual Tensor & softplus_out(Tensor & output, const Tensor & self, Scalar beta=1, Scalar threshold=20) const; + virtual Tensor softplus(const Tensor & self, Scalar beta=1, Scalar threshold=20) const; + virtual Tensor & softplus_forward_out(Tensor & output, const Tensor & self, Scalar beta, Scalar threshold) const; + virtual Tensor softplus_forward(const Tensor & self, Scalar beta, Scalar threshold) const; + virtual Tensor & softplus_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) const; + virtual Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, Scalar beta, Scalar threshold, const Tensor & output) const; + virtual Tensor & softshrink_out(Tensor & output, const Tensor & self, Scalar lambd=0.5) const; + virtual Tensor softshrink(const Tensor & self, Scalar lambd=0.5) const; + virtual Tensor & softshrink_forward_out(Tensor & output, const Tensor & self, Scalar lambd) const; + virtual Tensor softshrink_forward(const Tensor & self, Scalar lambd) const; + virtual Tensor & softshrink_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar lambd) const; + virtual Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scalar lambd) const; + virtual Tensor & threshold_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold(const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_forward_out(Tensor & output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold_forward(const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_(Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & threshold_forward_(Tensor & self, Scalar threshold, Scalar value) const; + virtual Tensor & adaptive_avg_pool2d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool2d(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) const; + virtual Tensor adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self) const; + virtual Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntList output_size) 
const; + virtual Tensor adaptive_avg_pool3d(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor adaptive_avg_pool3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) const; + virtual Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) const; + virtual std::tuple adaptive_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual std::tuple adaptive_max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & adaptive_max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices) const; + virtual Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor avg_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor & avg_pool2d_forward_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool3d_out(Tensor & output, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor avg_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, bool ceil_mode=false, bool count_include_pad=false) const; + virtual Tensor & avg_pool3d_forward_out(Tensor & output, const Tensor & self, IntList 
kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor & avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, bool ceil_mode, bool count_include_pad) const; + virtual std::tuple fractional_max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual std::tuple fractional_max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & random_samples) const; + virtual Tensor & fractional_max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) const; + virtual Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList output_size, const Tensor & indices) const; + virtual std::tuple max_pool2d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool2d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool2d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual std::tuple max_pool2d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual Tensor & max_pool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual std::tuple max_pool3d_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool3d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual std::tuple max_pool3d_forward_out(Tensor & output, Tensor & indices, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + virtual std::tuple max_pool3d_forward(const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode) const; + 
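The declarations above all follow one naming scheme that holds across this generated Type interface: a functional form with defaulted trailing arguments, an "_out" form that writes into caller-provided tensors, explicit "_forward"/"_backward" pairs used by the derivatives machinery, and, where it exists, a trailing-underscore in-place variant. The following minimal usage sketch assumes that the generated free functions in ATen/Functions.h (at::max_pool2d, at::max_pool2d_out, at::elu) and the Type-based factories (at::CPU(at::kFloat)) forward to these virtuals; the exact bindings are an assumption, not something this diff confirms.

#include <ATen/ATen.h>
#include <tuple>

void pooling_sketch() {
  // Assumed factory style of this ATen snapshot: build a CPU float tensor.
  at::Tensor input = at::CPU(at::kFloat).ones({1, 3, 8, 8});

  // Functional form: stride/padding/dilation/ceil_mode keep their declared defaults.
  at::Tensor output, indices;
  std::tie(output, indices) = at::max_pool2d(input, /*kernel_size=*/{2, 2});

  // "_out" form: results are written into caller-provided tensors instead of
  // being freshly allocated.
  at::Tensor out = at::CPU(at::kFloat).tensor();
  at::Tensor idx = at::CPU(at::kLong).tensor();
  at::max_pool2d_out(out, idx, input, {2, 2});

  // Scalar-defaulted ops follow the same pattern, e.g. elu(self, alpha=1, scale=1).
  at::Tensor activated = at::elu(output);
}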
virtual Tensor & max_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, const Tensor & indices) const; + virtual Tensor & max_unpool2d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d(const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool2d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d_forward(const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor max_unpool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size) const; + virtual Tensor & max_unpool3d_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & max_unpool3d_forward_out(Tensor & output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d_forward(const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & max_unpool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor max_unpool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices, IntList output_size, IntList stride, IntList padding) const; + virtual Tensor & reflection_pad1d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & reflection_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_out(Tensor & output, const Tensor & self, 
IntList padding) const; + virtual Tensor replication_pad1d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad1d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_forward_out(Tensor & output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d_forward(const Tensor & self, IntList padding) const; + virtual Tensor & replication_pad3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, IntList padding) const; + virtual Tensor & upsample_linear1d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_linear1d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_linear1d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_linear1d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_linear1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_linear1d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_bilinear2d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_bilinear2d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_bilinear2d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_bilinear2d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_bilinear2d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_bilinear2d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_trilinear3d_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor upsample_trilinear3d(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_trilinear3d_forward_out(Tensor & output, const Tensor & self, IntList output_size) const; + virtual Tensor 
upsample_trilinear3d_forward(const Tensor & self, IntList output_size) const; + virtual Tensor & upsample_trilinear3d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor upsample_trilinear3d_backward(const Tensor & grad_output, IntList output_size, IntList input_size) const; + virtual Tensor & upsample_nearest1d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest1d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest1d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest2d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_forward_out(Tensor & output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d_forward(const Tensor & self, int64_t scale_factor) const; + virtual Tensor & upsample_nearest3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor upsample_nearest3d_backward(const Tensor & grad_output, const Tensor & self, int64_t scale_factor) const; + virtual Tensor & _sigmoid_out(Tensor & output, const Tensor & self) const; + virtual Tensor _sigmoid(const Tensor & self) const; + virtual Tensor & _sigmoid_forward_out(Tensor & output, const Tensor & self) const; + virtual Tensor _sigmoid_forward(const Tensor & self) const; + virtual Tensor & _sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) const; + virtual Tensor _sigmoid_backward(const Tensor & grad_output, const Tensor & output) const; + virtual Tensor & _tanh_out(Tensor & output, const Tensor & self) const; + virtual Tensor _tanh(const Tensor & self) const; + virtual Tensor & _tanh_forward_out(Tensor & output, const Tensor & self) const; + virtual Tensor _tanh_forward(const Tensor & self) const; + virtual Tensor & _tanh_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & output) const; + virtual Tensor _tanh_backward(const Tensor & grad_output, const Tensor & output) const; + virtual Tensor & thnn_batch_norm_out(Tensor & output, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double 
momentum, double eps) const; + virtual Tensor thnn_batch_norm(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_forward_out(Tensor & output, Tensor & save_mean, Tensor & save_std, const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_forward(const Tensor & self, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps) const; + virtual std::tuple thnn_batch_norm_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std) const; + virtual std::tuple thnn_batch_norm_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, bool training, double eps, const Tensor & save_mean, const Tensor & save_std, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_transpose2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_transpose2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_transpose2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_transpose2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_transpose3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_transpose3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_transpose3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & 
self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList output_padding, IntList dilation) const; + virtual std::tuple thnn_conv_transpose3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv_transpose3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList output_padding, IntList dilation, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual std::tuple thnn_conv2d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_depthwise2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_depthwise2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor & thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual Tensor thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, 
IntList stride, IntList padding, IntList dilation, std::array output_mask={{true, true}}) const; + virtual Tensor & thnn_conv3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual Tensor thnn_conv3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0) const; + virtual std::tuple thnn_conv3d_forward_out(Tensor & output, Tensor & finput, Tensor & fgrad_input, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv3d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding) const; + virtual std::tuple thnn_conv3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input) const; + virtual std::tuple thnn_conv3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, const Tensor & finput, const Tensor & fgrad_input, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_dilated2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_dilated2d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_dilated2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated2d_forward(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_dilated2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor & thnn_conv_dilated3d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual Tensor thnn_conv_dilated3d(const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1) const; + virtual std::tuple thnn_conv_dilated3d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated3d_forward(const 
Tensor & self, const Tensor & weight, IntList kernel_size, const Tensor & bias, IntList stride, IntList padding, IntList dilation) const; + virtual std::tuple thnn_conv_dilated3d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones) const; + virtual std::tuple thnn_conv_dilated3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, const Tensor & columns, const Tensor & ones, std::array output_mask={{true, true, true}}) const; + virtual Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) const; + virtual std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) const; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08) const; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addmv_out(Tensor & result, const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps, bool cudnn_enabled) const; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const; + virtual Tensor & bernoulli_(Tensor & self, double p=0.5, Generator * generator=nullptr) const; + virtual Tensor cat(TensorList tensors, int64_t dim=0) const; + virtual Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim=0) const; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual Tensor & sspaddmm_out(Tensor & result, const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const; + virtual bool cudnn_is_acceptable(const Tensor & self) const; + virtual Tensor convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) const; + virtual Tensor _convolution(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) const; + virtual Tensor _convolution_nogroup(const Tensor & input, const Tensor & weight, const Tensor & bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) const; + virtual std::tuple _convolution_double_backward(const Tensor & ggI, 
const Tensor & ggW, const Tensor & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) const; + virtual Tensor conv1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList dilation=1, int64_t groups=1) const; + virtual Tensor conv_tbc(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad) const; + virtual std::tuple conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad) const; + virtual Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const Tensor & bias={}, IntList stride=1, IntList padding=0, IntList output_padding=0, int64_t groups=1, IntList dilation=1) const; + virtual Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) const; + virtual Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) const; + virtual std::tuple cudnn_batch_norm(const Tensor & input, const Tensor & weight, const Tensor & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double exponential_average_factor, double epsilon) const; + virtual std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const Tensor & running_mean, const Tensor & running_var, const Tensor & save_mean, const Tensor & save_var, double epsilon) const; + virtual Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, const Tensor & bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_backward_input(IntList self_size, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual std::tuple cudnn_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) const; + virtual Tensor cudnn_convolution_backward_bias(const Tensor & grad_output) const; + virtual Tensor cudnn_convolution_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, const Tensor & 
bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual std::tuple cudnn_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) const; + virtual Tensor cudnn_convolution_transpose_backward_bias(const Tensor & grad_output) const; + virtual Tensor cudnn_convolution_transpose_backward_input(const Tensor & grad_output, const Tensor & weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_convolution_transpose_backward_weight(IntList weight_size, const Tensor & grad_output, const Tensor & self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) const; + virtual Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid) const; + virtual std::tuple cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output) const; + virtual Tensor det(const Tensor & self) const; + virtual std::tuple _det_with_svd(const Tensor & self) const; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const; + virtual Tensor embedding(const Tensor & weight, const Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) const; + virtual Tensor embedding_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) const; + virtual Tensor embedding_dense_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) const; + virtual Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type) const; + virtual Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) const; + virtual Tensor empty_like(const Tensor & self) const; + virtual std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) const; + virtual Tensor embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) const; + virtual Tensor embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) const; + virtual Tensor embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) const; + virtual Tensor expand(const Tensor & self, IntList size) const; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const; + virtual Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, bool size_average, bool reduce) const; + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const; + virtual Tensor & ger_out(Tensor & result, const Tensor & self, const Tensor & vec2) const; + virtual Tensor 
index(const Tensor & self, TensorList indices) const; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const; + virtual bool is_cuda(const Tensor & self) const; + virtual bool is_distributed(const Tensor & self) const; + virtual bool is_floating_point(const Tensor & self) const; + virtual bool is_nonzero(const Tensor & self) const; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const; + virtual bool is_signed(const Tensor & self) const; + virtual bool is_sparse(const Tensor & self) const; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const; + virtual std::tuple max_pool1d(const Tensor & self, IntList kernel_size, IntList stride={}, IntList padding=0, IntList dilation=1, bool ceil_mode=false) const; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const; + virtual Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const; + virtual Tensor & mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const; + virtual Tensor permute(const Tensor & self, IntList dims) const; + virtual Tensor pin_memory(const Tensor & self) const; + virtual Tensor rand_like(const Tensor & self) const; + virtual Tensor randn_like(const Tensor & self) const; + virtual Tensor repeat(const Tensor & self, IntList repeats) const; + virtual std::tuple RoiPooling2d_forward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) const; + virtual Tensor RoiPooling2d_backward(const Tensor & input, const Tensor & rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, const Tensor & gradOutput, const Tensor & argmaxes) const; + virtual Tensor rrelu(const Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor & rrelu_(Tensor & self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator * generator=nullptr) const; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const; + virtual Tensor selu(const Tensor & self) const; + virtual Tensor & selu_(Tensor & self) const; + virtual int64_t size(const Tensor & self, int64_t dim) const; + virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const; + virtual Tensor squeeze(const Tensor & self) const; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const; + virtual Tensor & squeeze_(Tensor & self) const; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const; + virtual Tensor stack(TensorList tensors, int64_t dim=0) const; + virtual Tensor & stack_out(Tensor & result, TensorList tensors, int64_t dim=0) const; + virtual Tensor stft(const Tensor & self, int64_t frame_length, int64_t hop, int64_t fft_size, bool return_onesided=true, const Tensor & window={}, int64_t pad_end=0) const; + virtual int64_t stride(const Tensor & self, int64_t dim) const; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const; + virtual Tensor & t_(Tensor & self) const; + virtual Tensor type_as(const Tensor & self, const Tensor & other) const; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const; + virtual Tensor 
& unsqueeze_(Tensor & self, int64_t dim) const; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const; + virtual Tensor _s_where(const Tensor & condition, const Tensor & self, const Tensor & other) const; + virtual Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output) const; + virtual Tensor poisson(const Tensor & self, Generator * generator=nullptr) const; + virtual Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) const; + virtual std::tuple _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state) const; + virtual std::tuple> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const Tensor & cx, const Tensor & output, const Tensor & grad_output, const Tensor & grad_hy, const Tensor & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, const Tensor & dropout_state, const Tensor & reserve, std::array output_mask) const; +protected: + Context* context; +}; + + +} diff --git a/aten/src/ATen/.gitignore b/aten/src/ATen/.gitignore new file mode 100644 index 0000000..12bfd61 --- /dev/null +++ b/aten/src/ATen/.gitignore @@ -0,0 +1 @@ +Config.h diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h new file mode 100644 index 0000000..a7084d4 --- /dev/null +++ b/aten/src/ATen/ATen.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include "ATen/CPUGeneral.h" +#include "ATen/Allocator.h" +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Generator.h" +#include "ATen/Context.h" +#include "ATen/Storage.h" +#include "ATen/Tensor.h" +#include "ATen/Device.h" +#include "ATen/TensorGeometry.h" +#include "ATen/Functions.h" +#include "ATen/Formatting.h" +#include "ATen/TensorOperators.h" +#include "ATen/TensorMethods.h" +#include "ATen/Dispatch.h" +#include "ATen/DimVector.h" +#include "ATen/DeviceGuard.h" +#include "ATen/TensorOptions.h" +#include "ATen/Layout.h" +#include "ATen/OptionsGuard.h" diff --git a/aten/src/ATen/ATenConfig.cmake.in b/aten/src/ATen/ATenConfig.cmake.in new file mode 100644 index 0000000..e945926 --- /dev/null +++ b/aten/src/ATen/ATenConfig.cmake.in @@ -0,0 +1,9 @@ +# Find the TH includes and library +# +# ATEN_INCLUDE_DIR -- where to find the includes +# ATEN_LIBRARIES -- list of libraries to link against +# ATEN_FOUND -- set to 1 if found + +SET(ATEN_FOUND 1) +SET(ATEN_INCLUDE_DIR "@ATEN_INCLUDE_DIR@") +SET(ATEN_LIBRARIES "@ATEN_LIBRARIES@") diff --git a/aten/src/ATen/ATenGeneral.h b/aten/src/ATen/ATenGeneral.h new file mode 100644 index 0000000..88c58a0 --- /dev/null +++ b/aten/src/ATen/ATenGeneral.h @@ -0,0 +1,11 @@ +#pragma once + +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define AT_API __declspec(dllexport) +# else +# define AT_API __declspec(dllimport) +# endif +#else +# define AT_API +#endif diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h new file mode 100644 index 
0000000..b11b5ba
--- /dev/null
+++ b/aten/src/ATen/AccumulateType.h
@@ -0,0 +1,43 @@
+#pragma once
+#include "ATen/Config.h"
+#include "ATen/Half.h"
+
+// Defines the accumulation type for a scalar type.
+// Example:
+//   using accscalar_t = acc_type<scalar_t, true>;
+
+#ifdef __CUDACC__
+#include <cuda.h>
+#include <cuda_fp16.h>
+#endif
+
+namespace at {
+
+template <typename T, bool is_cuda>
+struct AccumulateType { };
+
+#ifdef __CUDACC__
+template <> struct AccumulateType<half, true> { using type = float; };
+#endif
+template <> struct AccumulateType<Half, true> { using type = float; };
+template <> struct AccumulateType<float, true> { using type = float; };
+template <> struct AccumulateType<double, true> { using type = double; };
+template <> struct AccumulateType<int8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<char, true> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, true> { using type = int64_t; };
+template <> struct AccumulateType<float, false> { using type = double; };
+template <> struct AccumulateType<double, false> { using type = double; };
+template <> struct AccumulateType<int8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<char, false> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, false> { using type = int64_t; };
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type;
+
+} // namespace at
diff --git a/aten/src/ATen/AlignOf.h b/aten/src/ATen/AlignOf.h
new file mode 100644
index 0000000..5e9f012
--- /dev/null
+++ b/aten/src/ATen/AlignOf.h
@@ -0,0 +1,145 @@
+//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AlignedCharArray and AlignedCharArrayUnion classes.
+//
+//===----------------------------------------------------------------------===//
+
+// ATen: modified from llvm::AlignOf
+// replaced LLVM_ALIGNAS with alignas
+
+#pragma once
+
+#include <cstddef>
+
+namespace at {
+
+/// \struct AlignedCharArray
+/// \brief Helper for building an aligned character array type.
+///
+/// This template is used to explicitly build up a collection of aligned
+/// character array types. We have to build these up using a macro and explicit
+/// specialization to cope with MSVC (at least till 2015) where only an
+/// integer literal can be used to specify an alignment constraint. Once built
+/// up here, we can then begin to indirect between these using normal C++
+/// template parameters.
+
+// MSVC requires special handling here.
+#ifndef _MSC_VER
+
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray {
+  alignas(Alignment) char buffer[Size];
+};
+
+#else // _MSC_VER
+
+/// \brief Create a type with an aligned char buffer.
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray;
+
+// We provide special variations of this template for the most common
+// alignments because __declspec(align(...)) doesn't actually work when it is
+// a member of a by-value function argument in MSVC, even if the alignment
+// request is something reasonably like 8-byte or 16-byte.
Note that we can't +// even include the declspec with the union that forces the alignment because +// MSVC warns on the existence of the declspec despite the union member forcing +// proper alignment. + +template +struct AlignedCharArray<1, Size> { + union { + char aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<2, Size> { + union { + short aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<4, Size> { + union { + int aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<8, Size> { + union { + double aligned; + char buffer[Size]; + }; +}; + + +// The rest of these are provided with a __declspec(align(...)) and we simply +// can't pass them by-value as function arguments on MSVC. + +#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ + }; + +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) + +#undef AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT + +#endif // _MSC_VER + +namespace detail { +template +class AlignerImpl { + T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; + + AlignerImpl() = delete; +}; + +template +union SizerImpl { + char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; +}; +} // end namespace detail + +/// \brief This union template exposes a suitably aligned and sized character +/// array member which can hold elements of any of up to ten types. +/// +/// These types may be arrays, structs, or any other types. The goal is to +/// expose a char array buffer member which can be used as suitable storage for +/// a placement new of any of these types. Support for more than ten types can +/// be added at the cost of more boilerplate. +template +struct AlignedCharArrayUnion : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail::SizerImpl)> { +}; +} // end namespace at diff --git a/aten/src/ATen/Allocator.cpp b/aten/src/ATen/Allocator.cpp new file mode 100644 index 0000000..7d2f1fa --- /dev/null +++ b/aten/src/ATen/Allocator.cpp @@ -0,0 +1,14 @@ +#include + +namespace at { + +static void deleteInefficientStdFunctionContext(void* ptr) { + delete static_cast(ptr); +} + +at::DataPtr +InefficientStdFunctionContext::makeDataPtr(void* ptr, const std::function& deleter, Device device) { + return {ptr, new InefficientStdFunctionContext({ptr, deleter}), &deleteInefficientStdFunctionContext, device}; +} + +} // namespace at diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h new file mode 100644 index 0000000..867ae4c --- /dev/null +++ b/aten/src/ATen/Allocator.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace at { + +// A DataPtr is a unique pointer (with an attached deleter and some +// context for the deleter) to some memory, which also records what +// device is for its data. +// +// nullptr DataPtrs can still have a nontrivial device; this allows +// us to treat zero-size allocations uniformly with non-zero allocations. 
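Taken together, the comment above and the declarations that follow describe the ownership model: a DataPtr couples the raw pointer with a deleter context and the device the memory lives on, and an Allocator (declared further down in this header) is the factory that hands them out. A minimal sketch of a conforming CPU allocator follows; the class name and the malloc/free strategy are illustrative assumptions rather than part of this diff, and the include path is assumed.

#include <cstddef>
#include <cstdlib>
#include <ATen/Allocator.h>  // assumed include path for the header in this diff

namespace {

// Deleter: the context passed in is the data pointer itself.
void deleteWithFree(void* ctx) {
  std::free(ctx);
}

// Sketch of an Allocator whose deleter context equals the data pointer, which
// is exactly the situation the raw_deleter()/raw_allocate()/raw_deallocate()
// note further down is about.
struct SketchCPUAllocator final : at::Allocator {
  at::DataPtr allocate(size_t n) const override {
    void* data = n ? std::malloc(n) : nullptr;
    return {data, data, &deleteWithFree, at::kCPU};
  }
  at::DeleterFnPtr raw_deleter() const override {
    return &deleteWithFree;
  }
};

} // anonymous namespace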
+// +class DataPtr { +private: + detail::UniqueVoidPtr ptr_; + Device device_; +public: + // Choice of CPU here is arbitrary; if there's an "undefined" device + // we could use that too + DataPtr() : ptr_(), device_(kCPU) {} + DataPtr(void* data, Device device) + : ptr_(data), device_(device) {} + DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) + : ptr_(data, ctx, ctx_deleter), device_(device) {} + void* operator->() const { return ptr_.get(); } + void* get() const { return ptr_.get(); } + void* get_context() const { return ptr_.get_context(); } + void* release_context() { return ptr_.release_context(); } + operator bool() const { return static_cast(ptr_); } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + return ptr_.cast_context(expected_deleter); + } + Device device() const { return device_; } +}; + +// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a +// CPU nullptr + +inline bool operator==(const at::DataPtr& dp, std::nullptr_t) noexcept { return !dp; } +inline bool operator==(std::nullptr_t, const at::DataPtr& dp) noexcept { return !dp; } +inline bool operator!=(const at::DataPtr& dp, std::nullptr_t) noexcept { return dp; } +inline bool operator!=(std::nullptr_t, const at::DataPtr& dp) noexcept { return dp; } + +// Note [raw_allocate/raw_deallocate and Thrust] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Thrust's support for custom allocators requires us to write something +// like this: +// +// class ThrustAllocator { +// char* allocate(size_t); +// void deallocate(char*, size_t); +// }; +// +// This is not good for our unique_ptr based allocator interface, as +// there is no way to get to the context when we free. +// +// However, in some cases the context is exactly the same as +// the data pointer. In this case, we can support the "raw" +// allocate and deallocate interface. This is what +// raw_deleter signifies. By default, it returns a nullptr, which means that +// the raw interface is not implemented. Be sure to implement it whenever +// possible, or the raw interface will incorrectly reported as unsupported, +// when it is actually possible. + +struct Allocator { + virtual ~Allocator() {} + virtual at::DataPtr allocate(size_t n) const = 0; + + // If this returns a non nullptr, it means that allocate() + // is guaranteed to return a unique_ptr with this deleter attached; + // it means the rawAllocate and rawDeallocate APIs are safe to use. + // This function MUST always return the same BoundDeleter. + virtual DeleterFnPtr raw_deleter() const { return nullptr; } + void* raw_allocate(size_t n) { + auto dptr = allocate(n); + AT_ASSERT(dptr.get() == dptr.get_context()); + return dptr.release_context(); + } + void raw_deallocate(void* ptr) { + auto d = raw_deleter(); + AT_ASSERT(d); + d(ptr); + } +}; + +struct AT_API InefficientStdFunctionContext { + std::unique_ptr> ptr_; + InefficientStdFunctionContext(std::unique_ptr>&& ptr) + : ptr_(std::move(ptr)) {} + static at::DataPtr makeDataPtr(void* ptr, const std::function& deleter, Device device); +}; + +} // namespace at diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h new file mode 100644 index 0000000..df14402 --- /dev/null +++ b/aten/src/ATen/ArrayRef.h @@ -0,0 +1,192 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include + +#include +#include +#include + +namespace at { + /// ArrayRef - Represent a constant reference to an array (0 or more elements + /// consecutively in memory), i.e. a start pointer and a length. It allows + /// various APIs to take consecutive elements easily and conveniently. + /// + /// This class does not own the underlying data, it is expected to be used in + /// situations where the data resides in some other buffer, whose lifetime + /// extends past that of the ArrayRef. For this reason, it is not in general + /// safe to store an ArrayRef. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + template + class ArrayRef { + public: + typedef const T *iterator; + typedef const T *const_iterator; + typedef size_t size_type; + + typedef std::reverse_iterator reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T *Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + /*implicit*/ ArrayRef(const T &OneElt) + : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + /*implicit*/ ArrayRef(const T *data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + ArrayRef(const T *begin, const T *end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + : Data(Vec.data()), Length(Vec.size()) { + } + + /// Construct an ArrayRef from a std::vector. + template + /*implicit*/ ArrayRef(const std::vector &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /*implicit*/ constexpr ArrayRef(const std::array &Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /*implicit*/ ArrayRef(const std::initializer_list &Vec) + : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + const_iterator begin() const { return Data; } + const_iterator end() const { return Data + Length; } + + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + + /// empty - Check if the array is empty. + bool empty() const { return Length == 0; } + + const T *data() const { return Data; } + + /// size - Get the array size. + size_t size() const { return Length; } + + /// front - Get the first element. + const T &front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. 
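+    ///
+    /// Example (illustrative):
+    ///   at::ArrayRef<int>({1, 2, 3}).back();   // returns 3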
+ const T &back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length-1]; + } + + /// equals - Check for element-wise equality. + bool equals(ArrayRef RHS) const { + if (Length != RHS.Length) + return false; + return std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + ArrayRef slice(size_t N, size_t M) const { + AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); + return ArrayRef(data()+N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + ArrayRef slice(size_t N) const { return slice(N, size() - N); } + + /// @} + /// @name Operator Overloads + /// @{ + const T &operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + const T &at(size_t Index) const { + AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data+Length); + } + + /// @} + /// @name Conversion operators + /// @{ + operator std::vector() const { + return std::vector(Data, Data+Length); + } + + /// @} + }; + +} // end namespace at diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp new file mode 100644 index 0000000..8ee61c7 --- /dev/null +++ b/aten/src/ATen/Backtrace.cpp @@ -0,0 +1,230 @@ +#include +#include + +#include +#include +#include +#include +#include + +#if !defined(_WIN32) +#include +#include +#endif // !defined(_WIN32) + +namespace at { +#if defined(_MSC_VER) +// Windows does not have cxxabi.h, so we will simply return the original. +std::string demangle(const char* name) { + return std::string(name); +} +#else +std::string demangle(const char* name) { + int status = -1; + + // This function will demangle the mangled function name into a more human + // readable format, e.g. _Z1gv -> g(). + // More information: + // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/libsupc%2B%2B/cxxabi.h + // NOTE: `__cxa_demangle` returns a malloc'd string that we have to free + // ourselves. + std::unique_ptr> demangled( + abi::__cxa_demangle( + name, + /*__output_buffer=*/nullptr, + /*__length=*/0, + &status), + /*deleter=*/free); + + // Demangling may fail, for example when the name does not follow the + // standard C++ (Itanium ABI) mangling scheme. This is the case for `main` + // or `clone` for example, so the mangled name is a fine default. + if (status == 0) { + return demangled.get(); + } else { + return name; + } +} +#endif + +// TODO: This backtrace retrieval can be implemented on Windows via the Windows +// API using `CaptureStackBackTrace` and `SymFromAddr`. 
+// https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code +// https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows +// https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. +#if !defined(_WIN32) + +namespace { + +struct FrameInformation { + /// If available, the demangled name of the function at this frame, else + /// whatever (possibly mangled) name we got from `backtrace()`. + std::string function_name; + /// This is a number in hexadecimal form (e.g. "0xdead") representing the + /// offset into the function's machine code at which the function's body + /// starts, i.e. skipping the "prologue" that handles stack manipulation and + /// other calling convention things. + std::string offset_into_function; + /// NOTE: In debugger parlance, the "object file" refers to the ELF file that + /// the symbol originates from, i.e. either an executable or a library. + std::string object_file; +}; + +bool is_python_frame(const FrameInformation& frame) { + return frame.object_file == "python" || + (frame.object_file.find("libpython") != std::string::npos); +} + +at::optional parse_frame_information( + const std::string& frame_string) { + FrameInformation frame; + + // This is the function name in the CXX ABI mangled format, e.g. something + // like _Z1gv. Reference: + // https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling + std::string mangled_function_name; + +#if defined(__GLIBCXX__) + // In GLIBCXX, `frame_string` follows the pattern + // `(+) + // []` + + auto function_name_start = frame_string.find("("); + if (function_name_start == std::string::npos) { + return at::nullopt; + } + function_name_start += 1; + + auto offset_start = frame_string.find('+', function_name_start); + if (offset_start == std::string::npos) { + return at::nullopt; + } + offset_start += 1; + + const auto offset_end = frame_string.find(')', offset_start); + if (offset_end == std::string::npos) { + return at::nullopt; + } + + frame.object_file = frame_string.substr(0, function_name_start - 1); + frame.offset_into_function = + frame_string.substr(offset_start, offset_end - offset_start); + + // NOTE: We don't need to parse the return address because + // we already have it from the call to `backtrace()`. + + mangled_function_name = frame_string.substr( + function_name_start, (offset_start - 1) - function_name_start); +#elif defined(_LIBCPP_VERSION) + // In LIBCXX, The pattern is + // ` + + // ` + std::string skip; + std::istringstream input_stream(frame_string); + // operator>>() does not fail -- if the input stream is corrupted, the + // strings will simply be empty. + input_stream >> skip >> frame.object_file >> skip >> mangled_function_name >> + skip >> frame.offset_into_function; +#else +#warning Unknown standard library, backtraces may have incomplete debug information + return at::nullopt; +#endif // defined(__GLIBCXX__) + + // Some system-level functions don't have sufficient debug information, so + // we'll display them as "". They'll still have a return + // address and other pieces of information. 
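+  //
+  // For reference (values below are illustrative only), a GLIBCXX frame
+  // string has the shape
+  //
+  //   /usr/lib/libcaffe2.so(_ZN2at13get_backtraceEmmb+0x2a) [0x7f00deadbeef]
+  //
+  // i.e. <object-file>(<mangled-function-name>+<offset-into-function>)
+  // [<return-address>], and the mangled name may be empty for such frames.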
+ if (mangled_function_name.empty()) { + frame.function_name = ""; + return frame; + } + + frame.function_name = demangle(mangled_function_name.c_str()); + return frame; +} + +} // anonymous namespace + +#endif // !defined(_WIN32) + +std::string get_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { +#if !defined(_WIN32) + + // We always skip this frame (backtrace). + frames_to_skip += 1; + + std::vector callstack( + frames_to_skip + maximum_number_of_frames, nullptr); + // backtrace() gives us a list of return addresses in the current call stack. + // NOTE: As per man (3) backtrace it can never fail + // (http://man7.org/linux/man-pages/man3/backtrace.3.html). + auto number_of_frames = + ::backtrace(callstack.data(), static_cast(callstack.size())); + + // Skip as many frames as requested. This is not efficient, but the sizes here + // are small and it makes the code nicer and safer. + for (; frames_to_skip > 0 && number_of_frames > 0; + --frames_to_skip, --number_of_frames) { + callstack.erase(callstack.begin()); + } + + // `number_of_frames` is strictly less than the current capacity of + // `callstack`, so this is just a pointer subtraction and makes the subsequent + // code safer. + callstack.resize(static_cast(number_of_frames)); + + // `backtrace_symbols` takes the return addresses obtained from `backtrace()` + // and fetches string representations of each stack. Unfortunately it doesn't + // return a struct of individual pieces of information but a concatenated + // string, so we'll have to parse the string after. NOTE: The array returned + // by `backtrace_symbols` is malloc'd and must be manually freed, but not the + // strings inside the array. + std::unique_ptr> raw_symbols( + ::backtrace_symbols(callstack.data(), static_cast(callstack.size())), + /*deleter=*/free); + const std::vector symbols( + raw_symbols.get(), raw_symbols.get() + callstack.size()); + + // The backtrace string goes into here. + std::ostringstream stream; + + // Toggles to true after the first skipped python frame. + bool has_skipped_python_frames = false; + + for (size_t frame_number = 0; frame_number < callstack.size(); + ++frame_number) { + const auto frame = parse_frame_information(symbols[frame_number]); + + if (skip_python_frames && frame && is_python_frame(*frame)) { + if (!has_skipped_python_frames) { + stream << "\n"; + has_skipped_python_frames = true; + } + continue; + } + + // frame #: + stream << "frame #" << frame_number << ": "; + + if (frame) { + // + ( in ) + stream << frame->function_name << " + " << frame->offset_into_function + << " (" << callstack[frame_number] << " in " << frame->object_file + << ")\n"; + } else { + // In the edge-case where we couldn't parse the frame string, we can + // just use it directly (it may have a different format). + stream << symbols[frame_number] << "\n"; + } + } + + return stream.str(); + +#else + + return "(no backtrace available)"; +#endif +} +} // namespace at diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h new file mode 100644 index 0000000..347c430 --- /dev/null +++ b/aten/src/ATen/Backtrace.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
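+///
+/// Example (illustrative):
+///   const char* name = at::demangle_type<std::vector<int>>();
+///   // With RTTI enabled, name is something like
+///   // "std::vector<int, std::allocator<int> >"; otherwise a placeholder
+///   // string is returned.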
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt new file mode 100644 index 0000000..9e19223 --- /dev/null +++ b/aten/src/ATen/CMakeLists.txt @@ -0,0 +1,456 @@ +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # ---[ Generate and install header and cpp files + include(../../../cmake/Codegen.cmake) +endif() + +IF(NOT MSVC) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-absolute-value") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-absolute-value") +ENDIF(NOT MSVC) + +################################################################################ +# Helper functions +################################################################################ + +function(filter_list output input) + unset(result) + foreach(filename ${${input}}) + foreach(pattern ${ARGN}) + if("${filename}" MATCHES "${pattern}") + list(APPEND result "${filename}") + endif() + endforeach() + endforeach() + set(${output} ${result} PARENT_SCOPE) +endfunction() + + +# Can be compiled standalone +IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) + SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") + SET(AT_INSTALL_LIB_DIR "lib" CACHE PATH "AT install library subdirectory") + SET(AT_INSTALL_INCLUDE_DIR "include" CACHE PATH "AT install include subdirectory") + SET(AT_INSTALL_SHARE_DIR "share" CACHE PATH "AT install include subdirectory") +ENDIF() + +CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") +CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h") + +# NB: If you edit these globs, you'll have to update setup.py package_data as well +FILE(GLOB base_h "*.h" "detail/*.h") +FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") +FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") +FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") +FILE(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") +FILE(GLOB cudnn_cpp "cudnn/*.cpp") +FILE(GLOB mkl_cpp "mkl/*.cpp") +FILE(GLOB mkldnn_cpp "mkldnn/*.cpp") + +FILE(GLOB native_cpp "native/*.cpp") +FILE(GLOB native_sparse_cpp "native/sparse/*.cpp") +FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") +FILE(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +FILE(GLOB native_cuda_cu "native/cuda/*.cu") +FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") +FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") +FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") + +set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +if(AT_MKL_ENABLED) + set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) +endif() +if(AT_MKLDNN_ENABLED) + set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp}) +endif() + +IF(USE_CUDA OR 
USE_ROCM) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) + set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu}) + set(all_cuda_cpp ${native_cudnn_cpp} ${native_sparse_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + IF(CUDNN_FOUND) + SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp}) + ENDIF() +endif() + +filter_list(generated_h generated_cpp "\\.h$") +filter_list(cuda_generated_h cuda_generated_cpp "\\.h$") + +list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) +# so the build can find the generated header files +list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}) +IF(NOT AT_LINK_STYLE) + SET(AT_LINK_STYLE SHARED) +ENDIF() + +IF(BLAS_FOUND) + IF ($ENV{TH_BINARY_BUILD}) + MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") + list(APPEND ATen_CPU_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + if(USE_CUDA OR USE_ROCM) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + endif() + ELSE ($ENV{TH_BINARY_BUILD}) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) + if(USE_CUDA OR USE_ROCM) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${BLAS_LIBRARIES}") + endif() + ENDIF ($ENV{TH_BINARY_BUILD}) +ENDIF(BLAS_FOUND) + +IF(LAPACK_FOUND) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) + if(USE_CUDA OR USE_ROCM) + # Although Lapack provides CPU (and thus, one might expect that ATen_cuda + # would not need this at all), some of our libraries (magma in particular) + # backend to CPU BLAS/LAPACK implementations, and so it is very important + # we get the *right* implementation, because even if the symbols are the + # same, LAPACK implementions may have different calling conventions. + # This caused https://github.com/pytorch/pytorch/issues/7353 + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) + endif() +ENDIF(LAPACK_FOUND) + +IF (UNIX AND NOT APPLE) + INCLUDE(CheckLibraryExists) + # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830 + CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT) + IF(NEED_LIBRT) + list(APPEND ATen_CPU_DEPENDENCY_LIBS rt) + SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt) + ENDIF(NEED_LIBRT) +ENDIF(UNIX AND NOT APPLE) + +IF(UNIX) + SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h") + CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP) + IF(HAVE_MMAP) + ADD_DEFINITIONS(-DHAVE_MMAP=1) + ENDIF(HAVE_MMAP) + # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html + ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64) + CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN) + IF(HAVE_SHM_OPEN) + ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1) + ENDIF(HAVE_SHM_OPEN) + CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK) + IF(HAVE_SHM_UNLINK) + ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1) + ENDIF(HAVE_SHM_UNLINK) + CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE) + IF(HAVE_MALLOC_USABLE_SIZE) + ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1) + ENDIF(HAVE_MALLOC_USABLE_SIZE) +ENDIF(UNIX) + +if(NOT MSVC) + list(APPEND ATen_CPU_DEPENDENCY_LIBS m) +endif() + +if(MKLDNN_FOUND) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${MKLDNN_LIBRARIES}) +endif(MKLDNN_FOUND) + +list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) + +if(NOT MSVC) + # Preserve values for the main build + set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) + set(__aten_sleef_build_tests ${BUILD_TESTS}) + + # Unset our restrictive C++ flags here and reset them later. 
+ # Remove this once we use proper target_compile_options. + set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(CMAKE_CXX_FLAGS) + + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) + set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) + set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) + set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) + add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef" ${CMAKE_BINARY_DIR}/sleef) + set_property(TARGET sleef PROPERTY FOLDER "dependencies") + list(APPEND ATen_THIRD_PARTY_INCLUDE ${CMAKE_BINARY_DIR}/include) + link_directories(${CMAKE_BINARY_DIR}/sleef/lib) + list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) + + set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) + + # Set these back. TODO: Use SLEEF_ to pass these instead + set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) + set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) +endif() + +IF(USE_CUDA AND NOT USE_ROCM) + IF ($ENV{ATEN_STATIC_CUDA}) + # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support + # we first have to build a fake lib that links with no device callbacks, + # and then we link against this object file. + # This was recommended by the CuFFT team at NVIDIA + + # build fake CuFFT lib in build dir + EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc) + if(${CUDA_VERSION_MAJOR} EQUAL "8") + SET(CUFFT_FAKELINK_OPTIONS + --generate-code arch=compute_35,code=sm_35 + --generate-code arch=compute_50,code=sm_50 + --generate-code arch=compute_60,code=sm_60) + elseif(${CUDA_VERSION_MAJOR} EQUAL "9") + SET(CUFFT_FAKELINK_OPTIONS + --generate-code arch=compute_35,code=sm_35 + --generate-code arch=compute_50,code=sm_50 + --generate-code arch=compute_60,code=sm_60 + --generate-code arch=compute_70,code=sm_70) + else() + MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}") + endif() + ADD_CUSTOM_COMMAND( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a + COMMAND "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" -o ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a -Xcompiler -fPIC + ${CUFFT_FAKELINK_OPTIONS} + --device-link ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc -lcufft_static -lculibos + ) + ADD_CUSTOM_TARGET(FAKELINKED_CUFFT_TARGET DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) + add_library(FAKELINKED_CUFFT STATIC IMPORTED GLOBAL) + add_dependencies(FAKELINKED_CUFFT FAKELINKED_CUFFT_TARGET) + set_target_properties(FAKELINKED_CUFFT PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusparse_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a + FAKELINKED_CUFFT + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static.a + ) + ELSE() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + ${CUDA_cusparse_LIBRARY} + ${CUDA_curand_LIBRARY}) + ENDIF() + + if(CUDNN_FOUND) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDNN_LIBRARIES}) + endif(CUDNN_FOUND) + + IF(USE_MAGMA) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES}) + IF ($ENV{TH_BINARY_BUILD}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") + ENDIF($ENV{TH_BINARY_BUILD}) + ENDIF(USE_MAGMA) + IF ($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS 
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a") + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a") + ENDIF($ENV{ATEN_STATIC_CUDA}) +ENDIF() + +IF(USE_ROCM) + ### Link in the ROCm libraries BLAS / RNG. + FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib) + FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY}) +ENDIF() + +# Include CPU paths for CUDA as well +list(APPEND ATen_CUDA_INCLUDE ${ATen_CPU_INCLUDE}) + +# We have two libraries: libATen_cpu.so and libATen_cuda.so, +# with libATen_cuda.so depending on libATen_cpu.so. The CPU library +# contains CPU code only. libATen_cpu.so is invariant to the setting +# of USE_CUDA (it always builds the same way); libATen_cuda.so is only +# built when USE_CUDA=1 and CUDA is available. +set(ATen_CPU_SRCS ${all_cpu_cpp}) +if(AT_LINK_STYLE STREQUAL "INTERFACE") + # Source code can't be added to an interface library, so it is + # passed back to be compiled into the containing library + add_library(ATen_cpu INTERFACE) + list(APPEND ATen_CPU_DEPENDENCY_LIBS ATEN_CPU_FILES_GEN_LIB) +else() + add_library(ATen_cpu ${AT_LINK_STYLE} ${ATen_CPU_SRCS}) + if (ATen_THIRD_PARTY_INCLUDE) + target_include_directories(ATen_cpu SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + endif() + target_include_directories(ATen_cpu INTERFACE $) + target_include_directories(ATen_cpu PRIVATE ${ATen_CPU_INCLUDE}) + target_link_libraries(ATen_cpu PUBLIC ${ATen_CPU_DEPENDENCY_LIBS}) + target_link_libraries(ATen_cpu PRIVATE ATEN_CPU_FILES_GEN_LIB) + caffe2_interface_library(ATen_cpu ATen_cpu_library) + # Set standard properties on the target + aten_set_target_props(ATen_cpu) + + # Make sure these don't get built by parent + set(ATen_CPU_SRCS) +endif() + +if(USE_CUDA OR USE_ROCM) + set(ATen_CUDA_SRCS ${all_cuda_cpp}) + if(AT_LINK_STYLE STREQUAL "INTERFACE") + # Source code can't be added to an interface library, so it is + # passed back to be compiled into the containing library + add_library(ATen_cuda INTERFACE) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) + else() + # A hack to deal with cuda library dependencies and modern CMake: the + # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, + # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This + # hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with + # it. We will then manually add the cudart library as interface libs. + set(__tmp ${CUDA_LIBRARIES}) + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + torch_cuda_based_add_library(ATen_cuda ${AT_LINK_STYLE} ${ATen_CUDA_SRCS}) + set(CUDA_LIBRARIES ${__tmp}) + target_link_libraries(ATen_cuda INTERFACE caffe2::cudart) + + target_include_directories( + ATen_cuda INTERFACE $) + target_include_directories( + ATen_cuda PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_include_directories( + ATen_cuda PRIVATE ${ATen_CUDA_INCLUDE}) + target_link_libraries( + ATen_cuda PRIVATE ${ATen_CUDA_DEPENDENCY_LIBS} ATEN_CUDA_FILES_GEN_LIB) + + # These public dependencies must go after the previous dependencies, as the + # order of the libraries in the linker call matters here when statically + # linking; libculibos and cublas must be last. 
+ target_link_libraries( + ATen_cuda PUBLIC ATen_cpu ${ATen_PUBLIC_CUDA_DEPENDENCY_LIBS}) + + # Set standard properties on the target + aten_set_target_props(ATen_cuda) + + caffe2_interface_library(ATen_cuda ATen_cuda_library) + + # Make sure these don't get built by parent + set(ATen_CUDA_SRCS) + endif() +endif() + +if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") + if(USE_CUDA) + if (NOT $ENV{ATEN_STATIC_CUDA}) + cuda_add_cublas_to_target(ATen_cuda) + cuda_add_cufft_to_target(ATen_cuda) + endif() + endif() + + if(NOT MSVC) + aten_compile_options(ATen_cpu) + if(USE_CUDA OR USE_ROCM) + aten_compile_options(ATen_cuda) + endif() + endif() + + if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") + set_property(TARGET ATen_cpu PROPERTY CXX_STANDARD 11) + if(USE_CUDA OR USE_ROCM) + set_property(TARGET ATen_cuda PROPERTY CXX_STANDARD 11) + endif() + endif() +endif() + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + # Eventually replace this use of LOCATION with use of + # $, but generators only work in some cases + cmake_policy(SET CMP0026 OLD) + get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) + get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) + set(ATEN_LIBRARIES + "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") + if(USE_CUDA OR USE_ROCM) + get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) + get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) + list(APPEND ATEN_LIBRARIES + "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") + endif() + + install(TARGETS ATen_cpu + RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" + LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") + + if(USE_CUDA OR USE_ROCM) + install(TARGETS ATen_cuda + RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" + LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") + endif() +endif() + +SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") +CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") +INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" + DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") + +# https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake +FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) + GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) + INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) +ENDFOREACH() +FOREACH(HEADER ${generated_h} ${cuda_generated_h}) + # NB: Assumed to be flat + INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen) +ENDFOREACH() +INSTALL(FILES ${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml + DESTINATION ${AT_INSTALL_SHARE_DIR}/ATen) + +# if(ATEN_NO_TEST) +# message("disable test because ATEN_NO_TEST is set") +# else() +# add_subdirectory(test) +# endif() + +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + foreach(test_src ${ATen_CPU_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} ATen_cpu) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + 
endforeach() + + if(USE_CUDA OR USE_ROCM) + foreach(test_src ${ATen_CUDA_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + torch_cuda_based_add_executable(${test_name} "${test_src}") + target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + endforeach() + endif() + + # Make sure these don't get built by parent + set(ATen_CPU_TEST_SRCS) + set(ATen_CUDA_TEST_SRCS) +endif() + +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h new file mode 100644 index 0000000..2db2786 --- /dev/null +++ b/aten/src/ATen/CPUApplyUtils.h @@ -0,0 +1,492 @@ +#pragma once + +#include "ATen/Parallel.h" +#include "ATen/TensorUtils.h" +#include + +namespace at { + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where + * the data is no longer contiguous, i.e. the stride at that dimension is not + * equal to the size of the tensor defined by the outer dimensions. Let's call + * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then + * A is equal to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * + * index_B), i.e. basically we compute the offset into the storage as we would + * normally for a Tensor. But because we are guaranteed the subsequent data is + * contiguous in memory, we can simply loop for sizeof(A) iterations and perform + * the operation, without having to follow the order described by the strides of + * A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in + * memory. For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, + * then the first two dimensions can be merged for the purposes of APPLY, + * reducing the number of nested loops. 
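+ *
+ * Working through that example: assuming the usual contiguous layout of the
+ * original 3x3x4x3 tensor, the narrowed tensor has sizes (3, 3, 3, 3) and
+ * strides (36, 12, 3, 1). Since stride(0) == size(1) * stride(1)
+ * (36 == 3 * 12), dimensions 0 and 1 merge into a single dimension of size 9
+ * and stride 12; stride(1) != size(2) * stride(2) (12 != 9), so the merging
+ * stops there, leaving sizes (9, 3, 3).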
+ */ + +inline Tensor sort_strides(Tensor& tensor_) { + IntList strides = tensor_.strides(); + std::vector indices; + indices.reserve(tensor_.ndimension()); + for (int64_t i = 0; i < tensor_.ndimension(); i++) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { + return strides[i1] > strides[i2]; + }); + Tensor tensor = tensor_.permute(indices); + return tensor; +} + +template +inline void _setup_arrays(Tensor& tensor, Arg* iter) { + int64_t max_dim = tensor.ndimension(); + iter->dim_ = 0; + for (int64_t i = 0; i < max_dim; i++) { + int64_t size = tensor.size(i); + int64_t stride = tensor.stride(i); + while (i + 1 < max_dim && + (tensor.size(i + 1) == 1 || + tensor.stride(i) == tensor.size(i + 1) * tensor.stride(i + 1))) { + size = size * tensor.size(i + 1); + if (tensor.size(i + 1) != 1) + stride = tensor.stride(i + 1); + i++; + } + iter->sizes_[iter->dim_] = size; + iter->strides_[iter->dim_] = stride; + iter->dim_++; + } +} + +template +struct strided_tensor_iter_fixed { + public: + T* data_ = NULL; + int64_t dim_ = 0; + + int64_t counter_[N] = {0}; + int64_t sizes_[N] = {0}; + int64_t strides_[N] = {0}; + + strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete; + void operator=(strided_tensor_iter_fixed const& x) = delete; + strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; + strided_tensor_iter_fixed(Tensor& tensor, bool sort_strides = false) + : data_(tensor.data()) { + memset(counter_, 0, sizeof(int64_t) * N); + _setup_arrays(tensor, this); + } +}; + +template +struct strided_tensor_iter { + private: + public: + T* data_ = NULL; + int64_t dim_; + + std::vector counter_; + std::vector sizes_; + std::vector strides_; + + strided_tensor_iter(strided_tensor_iter const&) = delete; + void operator=(strided_tensor_iter const& x) = delete; + strided_tensor_iter(strided_tensor_iter&&) = default; + strided_tensor_iter(Tensor& tensor) + : data_(tensor.data()), + dim_(tensor.ndimension()), + counter_(dim_, 0), + sizes_(tensor.sizes()), + strides_(tensor.strides()) { + _setup_arrays(tensor, this); + } +}; + +inline bool _all_equal_numel(at::ArrayRef tensors) { + if (tensors.size() == 0) + return true; + int64_t all_numel = tensors[0].numel(); + for (size_t i = 1; i < tensors.size(); i++) { + if (tensors[i].numel() != all_numel) + return false; + } + return true; +} + +inline std::string _all_equal_numel_error(at::ArrayRef tensors) { + std::ostringstream oss; + oss << "inconsistent tensor size, expected "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].sizes() << ", "; + } + oss << "and " << tensors[tensors.size() - 1] + << " to have the same number of elements, but got "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].numel() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].numel() + << " elements respectively"; + return oss.str(); +} + +inline bool _apply_preamble(ArrayRef tensors) { + checkBackend("CPU_tensor_apply", tensors, Backend::CPU); + if (!_all_equal_numel(tensors)) + throw std::runtime_error(_all_equal_numel_error(tensors)); + // An empty tensor has no elements + for (auto& t : tensors) + if (t.numel() == 0) + return false; + return true; +} + +inline int64_t _max_dim_tensors(ArrayRef tensors) { + int64_t dim = 0; + for (auto& t : tensors) + dim = std::max(dim, t.ndimension()); + return dim; +} + +inline void iterate(int64_t size){}; + +template +inline void iterate(int64_t size, Arg& iter, Args&... 
iter_tail) { + iter.counter_[iter.dim_ - 1] += size; + iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1]; + iterate(size, iter_tail...); +} + +inline bool iterate_continue() { + return true; +}; + +template +inline bool iterate_continue(Arg& iter, Args&... iter_tail) { + return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] && + iterate_continue(iter_tail...); +} + +inline int64_t max_iterate_size() { + return std::numeric_limits::max(); +}; + +template +inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) { + return std::min( + (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]), + max_iterate_size(iter_tail...)); +} + +inline void iterate_overflow(){}; + +template +inline void iterate_overflow(Arg& iter, Args&... iter_tail) { + if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) { + for (int64_t i = iter.dim_ - 1; i > 0; i--) { + if (iter.counter_[i] == iter.sizes_[i]) { + iter.counter_[i] = 0; + iter.counter_[i - 1]++; + iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) + + iter.strides_[i - 1]; + } + } + } + iterate_overflow(iter_tail...); +} + +inline void forward(int64_t offset){}; + +template +inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) { + int64_t multi = offset; + for (int64_t i = iter.dim_ - 1; i >= 0; i--) { + int64_t inc = multi % iter.sizes_[i]; + multi = multi / iter.sizes_[i]; + iter.data_ = iter.data_ + inc * iter.strides_[i]; + iter.counter_[i] += inc; + } + forward(offset, iter_tail...); +} + +inline int64_t max_dim() { + return 0; +} + +template +inline int64_t max_dim(Arg& iter, Args&... iter_tail) { + return std::max(iter.dim_, max_dim(iter_tail...)); +} + +inline void apply_op(){}; + +template +inline void +apply_op(int64_t numel, int64_t offset, const Op& op, Args... iters) { + // For 0-dim tensors + if (numel == 1 && max_dim(iters...) == 0) { + op(*iters.data_...); + return; + } + if (offset > 0) + forward(offset, iters...); + // Splitting this into chunks helps the compiler create faster assembly + for (int64_t i = 0; i < numel;) { + for (; iterate_continue(iters...) && i < numel;) { + op(*iters.data_...); + iterate(1, iters...); + i++; + } + iterate_overflow(iters...); + } +} + + +inline void apply_kernel(){}; + +// TODO: Deal elegantly with 0-dim tensors. iters.strides_ of 0-dim +// strided_tensor_iter will be of size 0 for dim 0 and iters.strides_[iters.dim_ +// - 1] will index at -1. C++14 integer_sequence could be of use here. +template +inline void +apply_kernel(int64_t numel, int64_t offset, const Op& op, Args... 
iters) { + if (offset > 0) + forward(offset, iters...); + int64_t size = std::min(numel, max_iterate_size(iters...)); + op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); + iterate(size, iters...); + iterate_overflow(iters...); + int64_t i = size; + size = std::min(numel, max_iterate_size(iters...)); + for (; i < numel;) { + op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); + iterate(size, iters...); + i += size; + iterate_overflow(iters...); + } +} + +template +inline void +CPU_tensor_parallel_kernel_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (tensor1.numel() == 1) { + op(1, tensor1.data(), tensor2.data(), 0, 0); + return; + } + if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + 1, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_kernel( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + 1, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_kernel( + end - begin, + begin, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + }); + } +} + +/* + Apply a pointwise operator to sequence of tensors + + The calling convention for op is a function/functor that takes takes the same + number of pointers of type scalar as the number of given tensors. For example, + to compute a = b * c, op would be of the form: + [](scalar* a_val, const scalar* b_val, const scalar* c_val) { a_val[0] = + b_val[0] * c_val[0]; }; +*/ + +template +inline void CPU_tensor_apply1(Tensor tensor1, const Op op) { + if (!_apply_preamble({tensor1})) + return; + if (tensor1.ndimension() < 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1, true)); + } else { + apply_op(tensor1.numel(), 0, op, strided_tensor_iter(tensor1)); + } +} + +template +inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (_max_dim_tensors({tensor1, tensor2}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + } +} + +template +inline void +CPU_tensor_apply3(Tensor tensor1, Tensor tensor2, Tensor tensor3, const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3)); + } +} + +template < + typename scalar1, + typename scalar2, + typename scalar3, + typename scalar4, + typename Op> +inline void CPU_tensor_apply4( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + Tensor tensor4, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3), + strided_tensor_iter_fixed(tensor4)); + } else { + apply_op( + tensor1.numel(), + 0, 
+ op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3), + strided_tensor_iter(tensor4)); + } +} + +template +inline void CPU_tensor_parallel_apply1( + Tensor tensor1, + const Op op, + int64_t grain_size = internal::GRAIN_SIZE) { + if (!_apply_preamble({tensor1})) + return; + if (tensor1.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1, true)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, begin, op, strided_tensor_iter(tensor1)); + }); + } +} + +template +inline void CPU_tensor_parallel_apply2( + Tensor tensor1, + Tensor tensor2, + const Op op, + int64_t grain_size = internal::GRAIN_SIZE) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + }); + } else { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { + apply_op( + end - begin, + begin, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + }); + } +} + +} // namespace at diff --git a/aten/src/ATen/CPUFixedAllocator.h b/aten/src/ATen/CPUFixedAllocator.h new file mode 100644 index 0000000..c7caea5 --- /dev/null +++ b/aten/src/ATen/CPUFixedAllocator.h @@ -0,0 +1,31 @@ +#pragma once + +#include "TH/TH.h" +#include "ATen/Error.h" + +// This file creates a fake allocator that just throws exceptions if +// it is actually used. 
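+//
+// For illustration (names are assumptions): the state handed to this
+// allocator is a heap-allocated std::function<void(void*)>, e.g.
+//
+//   auto* on_release = new std::function<void(void*)>(
+//       [](void* data) { /* notify the owner of the external blob */ });
+//
+// cpu_fixed_free below invokes it with the allocation and then deletes it.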
+ +// state passed to the allocator is the std::function called +// when the blob is release by ATen + +namespace at { + +static cpu_fixed_malloc(void *, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_realloc(void *, void*, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_free(void * state, void * allocation) { + auto on_release = static_cast*>(state); + (*on_release)(allocation); + delete on_release; +} + +static THAllocator CPU_fixed_allocator = + { cpu_fixed_malloc, cpu_fixed_realloc, cpu_fixed_free }; + +} diff --git a/aten/src/ATen/CPUGeneral.cpp b/aten/src/ATen/CPUGeneral.cpp new file mode 100644 index 0000000..910e3ae --- /dev/null +++ b/aten/src/ATen/CPUGeneral.cpp @@ -0,0 +1,16 @@ +#include +#include +#include +#include + +namespace at { +// Lock free atomic type +std::atomic num_threads(-1); + +void set_num_threads(int num_threads_) { + if (num_threads_ >= 0) + num_threads.store(num_threads_); +} + +int get_num_threads() { return num_threads.load(); } +} diff --git a/aten/src/ATen/CPUGeneral.h b/aten/src/ATen/CPUGeneral.h new file mode 100644 index 0000000..83ee165 --- /dev/null +++ b/aten/src/ATen/CPUGeneral.h @@ -0,0 +1,12 @@ +#pragma once + +// Using AT_API is crucial as otherwise you'll see +// linking errors using MSVC +// See https://msdn.microsoft.com/en-us/library/a90k134d.aspx +// This header adds this if using AT_API +#include "ATen/ATenGeneral.h" + +namespace at { +AT_API void set_num_threads(int); +AT_API int get_num_threads(); +} diff --git a/aten/src/ATen/CPUGenerator.cpp b/aten/src/ATen/CPUGenerator.cpp new file mode 100644 index 0000000..d737e1f --- /dev/null +++ b/aten/src/ATen/CPUGenerator.cpp @@ -0,0 +1,49 @@ +#include "ATen/CPUGenerator.h" + +#define const_generator_cast(generator) \ + dynamic_cast(generator) + +namespace at { + +CPUGenerator::CPUGenerator(Context * context_) + : context(context_), generator(THGenerator_new()) +{} + +CPUGenerator::~CPUGenerator() { + if (generator) + THGenerator_free(generator); +} + +CPUGenerator& CPUGenerator::copy(const Generator& from) { + THGenerator_copy(generator, const_generator_cast(from).generator); + return *this; +} + +CPUGenerator& CPUGenerator::free() { + THGenerator_free(generator); + return *this; +} + +uint64_t CPUGenerator::seed() { + return THRandom_seed(generator); +} + +uint64_t CPUGenerator::initialSeed() { + return THRandom_initialSeed(generator); +} + +CPUGenerator& CPUGenerator::manualSeed(uint64_t seed) { + THRandom_manualSeed(generator, seed); + return *this; +} + +CPUGenerator& CPUGenerator::manualSeedAll(uint64_t seed) { + // There's only one CPU generator + return manualSeed(seed); +} + +void * CPUGenerator::unsafeGetTH() { + return generator; +} + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp new file mode 100644 index 0000000..ad9d51c --- /dev/null +++ b/aten/src/ATen/CUDAStream.cpp @@ -0,0 +1,183 @@ +#include "ATen/CUDAStream.h" +#include "ATen/Error.h" +#include "ATen/detail/CUDAHooksInterface.h" + +#include + +// Internal implementation is entirely hidden +struct CUDAStreamInternals { + bool is_destructible; + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaStream_t stream; +}; + +namespace at { + +namespace detail { + + /* + * Stream state + */ + static constexpr cudaStream_t DEFAULT_STREAM = 0; + + static std::once_flag init_flag; + static int64_t num_gpus; + static 
CUDAStreamInternals* default_streams; + static thread_local CUDAStreamInternals** current_streams = nullptr; + + // Creates a(n indestructible) default stream for each device + // Note: the default stream on each device is signified by a zero + // value for the pointer, and so is not actually created as usual. + // In particular, we don't need to switch devices when creating the + // streams. + static void initDefaultCUDAStreams() { + num_gpus = getCUDAHooks().getNumGPUs(); + default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + default_streams[i].is_destructible = false; + default_streams[i].refcount = 0; + default_streams[i].device = i; + default_streams[i].stream = DEFAULT_STREAM; + } + } + + // Init front-end to ensure initialization only occurs once + static void initCUDAStreamsOnce() { + // Inits default streams (once, globally) + std::call_once(init_flag, initDefaultCUDAStreams); + + // Inits current streams (thread local) to default streams + if (current_streams) return; + current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + current_streams[i] = &default_streams[i]; + } + } + + /* + * Pointer-based stream API + */ + + // Helper to return the current device + static inline int64_t current_device() { + int cur_device; + DynamicCUDAInterface::get_device(&cur_device); + return cur_device; + } + + // Helper to verify the GPU index is valid + static inline void check_gpu(int64_t device) { + AT_CHECK(device >= 0 && device < num_gpus); + } + + CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return &default_streams[device]; + } + CUDAStreamInternals* CUDAStream_getDefaultStream() { + return CUDAStream_getDefaultStreamOnDevice(current_device()); + } + + // Creates (and retains) and new cuda stream + CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { + CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); + internals->is_destructible = true; + internals->refcount = 1; + internals->device = current_device(); + DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority); + return internals; + } + + // Note: despite not being "unsafe," is using these methods in a multithreaded + // environment then the caller must be sure that streams are valid + // when they're requested. These methods will throw an error if an + // invalid stream is requested. + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + auto cur = current_streams[device]; + AT_CHECK(CUDAStream_retain(cur)); + return cur; + } + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { + return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); + } + + // Note: these unsafe methods do not retain the stream before returning it. + // This is unsafe behavior and these methods SHOULD NOT BE USED. + // They are here only for legacy compatibility. 
+ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return current_streams[device]; + } + CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { + return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); + } + + void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + check_gpu(device); + AT_CHECK(ptr); + AT_CHECK(ptr->device == device); + AT_CHECK(CUDAStream_retain(ptr)); + + CUDAStream_free(current_streams[device]); + current_streams[device] = ptr; + } + void CUDAStream_setStream(CUDAStreamInternals* ptr) { + CUDAStream_setStreamOnDevice(current_device(), ptr); + } + + // Getters + cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + return ptr->stream; + } + + int64_t CUDAStream_device(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + return ptr->device; + } + + // Memory management + // Note: only destructible (non-default) streams are ref counted + bool CUDAStream_retain(CUDAStreamInternals* ptr) { + AT_CHECK(ptr); + if (ptr->is_destructible) return(++ptr->refcount > 1); + return true; + } + + void CUDAStream_free(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + AT_CHECK(ptr->refcount == 0); + DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } + +} // namespace detail + + /* + * CUDAStream functions + */ + + // Copy constructor + CUDAStream::CUDAStream(const CUDAStream& other) { + AT_CHECK(other.internals_); + AT_CHECK(detail::CUDAStream_retain(other.internals_)); + + internals_ = other.internals_; + } + + // Move constructor + CUDAStream::CUDAStream(CUDAStream&& other) { + AT_CHECK(other.internals_); + + std::swap(internals_, other.internals_); + } + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h new file mode 100644 index 0000000..6e1a663 --- /dev/null +++ b/aten/src/ATen/CUDAStream.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include + +#include + +/* +* A CUDA stream interface with no CUDA build dependency. +* +* Includes the CUDAStream RAII class and a pointer-based stream API. +* +* The ATen Context interface should be preferred when working with streams. +*/ + +// Forward-declares cudaStream_t to avoid depending on CUDA in CPU builds +// Note: this is the internal CUDA runtime typedef for cudaStream_t +struct CUstream_st; +typedef struct CUstream_st* cudaStream_t; + +// Forward-declares internals +struct CUDAStreamInternals; + +namespace at { + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +AT_API CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStream(); + +AT_API CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); + +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); + +// Note: these Unsafe gets should NEVER be used and are only here for legacy +// purposes. Once those uses are gone they should be removed. 
+AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); + +AT_API void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +AT_API void CUDAStream_setStream(CUDAStreamInternals* internals); + +AT_API cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +AT_API int64_t CUDAStream_device(CUDAStreamInternals*); + +AT_API bool CUDAStream_retain(CUDAStreamInternals*); +AT_API void CUDAStream_free(CUDAStreamInternals*&); + +} // namespace detail + +// RAII for a CUDA stream +// Allows use as a cudaStream_t, copying, moving, and metadata access. +struct CUDAStream { + // Constants + static constexpr int32_t DEFAULT_FLAGS = 1; // = cudaStreamNonBlocking; + static constexpr int32_t DEFAULT_PRIORITY = 0; + + // Constructors + CUDAStream() = default; + CUDAStream(CUDAStreamInternals* internals) : internals_{internals} { } + + // Destructor + ~CUDAStream() { detail::CUDAStream_free(internals_); } + + // Copy constructor + AT_API CUDAStream(const CUDAStream& other); + + // Move constructor + AT_API CUDAStream(CUDAStream&& other); + + // Assignment operator + CUDAStream& operator=(CUDAStream other) { + std::swap(internals_, other.internals_); + return *this; + } + + // Implicit conversion to cudaStream_t + operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAStream& left, const CUDAStream& right) { + return left.internals_ < right.internals_; + } + + // Getters + int64_t device() const { return detail::CUDAStream_device(internals_); } + cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } + CUDAStreamInternals* internals() const { return internals_; } + +private: + CUDAStreamInternals* internals_ = nullptr; +}; + +} // namespace at diff --git a/aten/src/ATen/CheckGenerator.h b/aten/src/ATen/CheckGenerator.h new file mode 100644 index 0000000..3cf5c0f --- /dev/null +++ b/aten/src/ATen/CheckGenerator.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/Error.h" +#include "ATen/Generator.h" +#include "ATen/Utils.h" + +namespace at { + +template +static inline T * check_generator(Generator * expr, Generator * defaultValue) { + if (!expr) + expr = defaultValue; + if(auto result = dynamic_cast(expr)) + return result; + AT_ERROR("Expected a '", typeid(T).name(), "' but found '", typeid(expr).name(), "'"); +} + +} // namespace at diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in new file mode 100644 index 0000000..8373c92 --- /dev/null +++ b/aten/src/ATen/Config.h.in @@ -0,0 +1,10 @@ +#pragma once + +// Test these using #if AT_MKL_ENABLED(), not #ifdef, so that it's +// obvious if you forgot to include Config.h +// c.f. 
https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined +// +// DO NOT put the macros for CUDA libraries in this file; they belong in cuda/CUDAConfig.h + +#define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@ +#define AT_MKL_ENABLED() @AT_MKL_ENABLED@ diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp new file mode 100644 index 0000000..59f6ff7 --- /dev/null +++ b/aten/src/ATen/Context.cpp @@ -0,0 +1,95 @@ +#include "ATen/Config.h" + +#include "Context.h" + +#include +#include +#include +#include +#include + +#include "ATen/CPUGenerator.h" + +#ifdef USE_SSE3 +#include +#endif + +namespace at { + +static inline void errorHandler(const char * msg, void * data) { + throw std::runtime_error(msg); +} +static inline void argErrorHandler(int arg, const char * msg, void * data) { + std::stringstream new_error; + new_error << "invalid argument " << arg << ": " << msg; + throw std::runtime_error(new_error.str()); +} + +Context::Context() +: next_id(static_cast(TypeID::NumOptions)) +, thc_state(nullptr, [](THCState* p){ /* no-op */ } ) { + + THSetDefaultErrorHandler(errorHandler,nullptr); + THSetDefaultArgErrorHandler(argErrorHandler,nullptr); + + generator_registry[static_cast(Backend::CPU)] + .reset(new CPUGenerator(this)); + Type::registerCPU(this); +} + +Context & globalContext() { + static Context globalContext_; + return globalContext_; +} + +// NB: This method is *purely* whether or not a user requested +// that CuDNN was enabled, it doesn't actually say anything about +// whether or not CuDNN is actually usable. +bool Context::userEnabledCuDNN() const { + return enabled_cudnn; +} + +void Context::setUserEnabledCuDNN(bool e) { + enabled_cudnn = e; +} + +bool Context::deterministicCuDNN() const { + return deterministic_cudnn; +} + +void Context::setDeterministicCuDNN(bool b) { + deterministic_cudnn = b; +} + +bool Context::benchmarkCuDNN() const { + return benchmark_cudnn; +} + +void Context::setBenchmarkCuDNN(bool b) { + benchmark_cudnn = b; +} + +bool Context::hasMKL() const { +#if AT_MKL_ENABLED() + return true; +#else + return false; +#endif +} + +bool Context::setFlushDenormal(bool on) { +#ifdef USE_SSE3 + // Setting flush-to-zero (FTZ) flag + _MM_SET_FLUSH_ZERO_MODE(on ? _MM_FLUSH_ZERO_ON + : _MM_FLUSH_ZERO_OFF); + + // Setting denormals-are-zero (DAZ) flag + _MM_SET_DENORMALS_ZERO_MODE(on ? _MM_DENORMALS_ZERO_ON + : _MM_DENORMALS_ZERO_OFF); + return true; +#else + return false; +#endif +} + +} diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h new file mode 100644 index 0000000..accb57b --- /dev/null +++ b/aten/src/ATen/Context.h @@ -0,0 +1,208 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include "ATen/Generator.h" +#include "ATen/Type.h" +#include "ATen/Utils.h" +#include "ATen/Error.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/CUDAStream.h" + +#include +#include +#include + +namespace at { + +enum class IsVariable { + NotVariable, + Variable, + NumOptions +}; + +class AT_API Context { +public: + Context(); + Type* getTypeRaw(Backend p, ScalarType s) { + return type_registry[static_cast(p)][static_cast(s)].get(); + } + Type * getTypeOpt(Backend p, ScalarType s) { + initCUDAIfNeeded(p); + auto type = getTypeRaw(p, s); + + if(!type) { + // there is only a single Undefined Type. 
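A minimal sketch (illustrative only) of the rationale behind the function-style macros in Config.h.in above, following the same pattern Context.cpp's hasMKL() uses: testing them with #if fails loudly if ATen/Config.h was forgotten, because the undefined name is replaced by 0 and `0 ()` does not parse, whereas a plain #ifdef would silently take the disabled branch.

```
// Sketch only: why build flags are tested with #if AT_MKL_ENABLED(), not #ifdef.
#include <ATen/Config.h>

bool mkl_enabled_at_build_time() {
#if AT_MKL_ENABLED()
  return true;
#else
  return false;
#endif
}
```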
+ if (p == Backend::Undefined || s == ScalarType::Undefined) { + return getTypeRaw(Backend::Undefined, ScalarType::Undefined); + } + } + + return type; + } + Type & getType(Backend p, ScalarType s) { + auto* type = getTypeOpt(p, s); + if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); + return *type; + } + Generator & defaultGenerator(Backend p) { + initCUDAIfNeeded(p); + auto & generator = generator_registry[static_cast(p)]; + if(!generator) + AT_ERROR(toString(p), " backend type not enabled."); + return *generator; + } + bool hasMKL() const; + bool hasCUDA() const { + return detail::getCUDAHooks().hasCUDA(); + } + bool hasCuDNN() const { + return detail::getCUDAHooks().hasCuDNN(); + } + int64_t current_device() const { + return detail::getCUDAHooks().current_device(); + } + // defined in header so that getType has ability to inline + // call_once check. getType is called fairly frequently + THCState* lazyInitCUDA() { + std::call_once(thc_init,[&] { + thc_state = detail::getCUDAHooks().initCUDA(); + generator_registry[static_cast(Backend::CUDA)] = + detail::getCUDAHooks().initCUDAGenerator(this); + detail::getCUDAHooks().registerCUDATypes(this); + }); + return thc_state.get(); + } + + THCState* getTHCState() { + // AT_ASSERT(thc_state); + return thc_state.get(); + } + + CUDAStream createCUDAStream() const { + return detail::CUDAStream_createAndRetainWithOptions( + CUDAStream::DEFAULT_FLAGS + , CUDAStream::DEFAULT_PRIORITY + ); + } + + CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) const { + return detail::CUDAStream_createAndRetainWithOptions(flags, priority); + } + + CUDAStream getDefaultCUDAStream() const { + return detail::CUDAStream_getDefaultStream(); + } + + CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) const { + return detail::CUDAStream_getDefaultStreamOnDevice(device); + } + + CUDAStream getCurrentCUDAStream() const { + return detail::CUDAStream_getAndRetainCurrentStream(); + } + + CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) const { + return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); + } + + void setCurrentCUDAStream(CUDAStream stream) const { + return detail::CUDAStream_setStream(stream.internals()); + } + + void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) const { + return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); + } + +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle() const { + return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); + } +#endif + cudaDeviceProp* getCurrentDeviceProperties() const { + return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); + } + cudaDeviceProp* getDeviceProperties(int device) const { + return detail::getCUDAHooks().getDeviceProperties(thc_state.get(), device); + } + int getNumGPUs() const { + return detail::getCUDAHooks().getNumGPUs(); + } + size_t freshTypeID() { + return next_id++; + } + bool setFlushDenormal(bool on); + + // NB: This method is *purely* whether or not a user requested + // that CuDNN was enabled, it doesn't actually say anything about + // whether or not CuDNN is actually usable. 
Use cudnn_is_acceptable + // to test this instead + bool userEnabledCuDNN() const; + void setUserEnabledCuDNN(bool e); + bool benchmarkCuDNN() const; + void setBenchmarkCuDNN(bool); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + std::unique_ptr + generator_registry[static_cast(Backend::NumOptions)]; +private: + // NB: type_registry has nullptr for all CUDA backends until + // CUDA initialization has occurred + std::unique_ptr type_registry + [static_cast(Backend::NumOptions)] + [static_cast(ScalarType::NumOptions)]; + void initCUDAIfNeeded(Backend p) { + if(p == Backend::CUDA) + lazyInitCUDA(); + } + std::once_flag thc_init; + bool enabled_cudnn = true; + bool deterministic_cudnn = false; + bool benchmark_cudnn = false; + std::atomic next_id; + std::unique_ptr thc_state; + friend struct Type; + friend void register_cuda_types(Context * context); +}; + +AT_API Context & globalContext(); + +static inline void init() { + globalContext(); + if (const char *env_p = std::getenv("OMP_NUM_THREADS")) { + at::set_num_threads(std::stoi(env_p)); + } + if (const char *env_p = std::getenv("MKL_NUM_THREADS")) { + at::set_num_threads(std::stoi(env_p)); + } +} + +static inline Type& getType(Backend p, ScalarType s) { + return globalContext().getType(p, s); +} + +static inline Type& CPU(ScalarType s) { + return getType(Backend::CPU, s); +} + +static inline Type& CUDA(ScalarType s) { + return getType(Backend::CUDA, s); +} + +static inline bool hasCUDA() { + return globalContext().hasCUDA(); +} + +static inline bool hasCuDNN() { + return globalContext().hasCuDNN(); +} + +static inline bool hasMKL() { + return globalContext().hasMKL(); +} + +static inline int64_t current_device() { + return globalContext().current_device(); +} + +} // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp new file mode 100644 index 0000000..963a835 --- /dev/null +++ b/aten/src/ATen/DLConvertor.cpp @@ -0,0 +1,172 @@ +#include "ATen/DLConvertor.h" + +#include +#include + + +using namespace std; +namespace at { + +static DLDataType getDLDataType(const Type& type) { + DLDataType dtype; + dtype.lanes = 1; + dtype.bits = type.elementSizeInBytes() * 8; + switch (type.scalarType()) { + case ScalarType::Byte: + dtype.code = DLDataTypeCode::kDLUInt; + break; + case ScalarType::Char: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Double: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Float: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Int: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Long: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Short: + dtype.code = DLDataTypeCode::kDLInt; + break; + case ScalarType::Half: + dtype.code = DLDataTypeCode::kDLFloat; + break; + case ScalarType::Undefined: + throw std::logic_error("Undefined is not a valid ScalarType"); + case ScalarType::NumOptions: + throw std::logic_error("NumOptions is not a valid ScalarType"); + } + return dtype; +} + + +static DLContext getDLContext(const Type& type, const int64_t& device_id) { + DLContext ctx; + ctx.device_id = device_id; + if (type.is_cuda()) { + ctx.device_type = DLDeviceType::kDLGPU; + } else { + ctx.device_type = DLDeviceType::kDLCPU; + } + return ctx; +} + + +static Backend getATenBackend(const DLContext& ctx) { + Backend backend; + switch (ctx.device_type) { + case DLDeviceType::kDLCPU: + backend = Backend::CPU; + break; + case DLDeviceType::kDLGPU: + backend = Backend::CUDA; + break; + 
default: + throw std::logic_error("Unsupported device_type: " + std::to_string(ctx.device_type)); + } + return backend; +} + + +ScalarType toScalarType(const DLDataType& dtype) { + ScalarType stype; + if (dtype.lanes != 1) throw std::logic_error("ATen does not support lanes != 1"); + switch (dtype.code) { + case DLDataTypeCode::kDLUInt: + switch (dtype.bits) { + case 8: + stype = ScalarType::Byte; + break; + default: + throw std::logic_error("Unsupported kUInt bits " + std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLInt: + switch (dtype.bits) { + case 8: + stype = ScalarType::Char; + break; + case 16: + stype = ScalarType::Short; + break; + case 32: + stype = ScalarType::Int; + break; + case 64: + stype = ScalarType::Long; + break; + default: + throw std::logic_error("Unsupported kInt bits " + std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat: + switch (dtype.bits) { + case 16: + stype = ScalarType::Half; + break; + case 32: + stype = ScalarType::Float; + break; + case 64: + stype = ScalarType::Double; + break; + default: + throw std::logic_error("Unsupported kFloat bits " + std::to_string(dtype.bits)); + } + break; + default: + throw std::logic_error("Unsupported code " + std::to_string(dtype.code)); + } + return stype; +} + +struct ATenDLMTensor { + Tensor handle; + DLManagedTensor tensor; +}; + +void deleter(DLManagedTensor * arg) { + delete static_cast(arg->manager_ctx); +} + + +// This function returns a shared_ptr to memory managed DLpack tensor constructed +// out of ATen tensor +DLManagedTensor* toDLPack(const Tensor& src) { + ATenDLMTensor * atDLMTensor(new ATenDLMTensor); + atDLMTensor->handle = src; + atDLMTensor->tensor.manager_ctx = atDLMTensor; + atDLMTensor->tensor.deleter = &deleter; + atDLMTensor->tensor.dl_tensor.data = src.data_ptr(); + int64_t device_id = 0; + if (src.type().is_cuda()) { + device_id = src.get_device(); + } + atDLMTensor->tensor.dl_tensor.ctx = getDLContext(src.type(), device_id); + atDLMTensor->tensor.dl_tensor.ndim = src.dim(); + atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src.type()); + atDLMTensor->tensor.dl_tensor.shape = const_cast(src.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(src.strides().data()); + atDLMTensor->tensor.dl_tensor.byte_offset = 0; + return &(atDLMTensor->tensor); +} + + +Tensor fromDLPack(const DLManagedTensor* src) { + Backend backend = getATenBackend(src->dl_tensor.ctx); + ScalarType stype = toScalarType(src->dl_tensor.dtype); + auto deleter = [src](void * self) { + src->deleter(const_cast(src)); + }; + return getType(backend, stype).tensorFromBlob( + src->dl_tensor.data, + IntList(src->dl_tensor.shape, src->dl_tensor.ndim), + IntList(src->dl_tensor.strides, src->dl_tensor.ndim), + deleter); +} +} //namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h new file mode 100644 index 0000000..5ed9899 --- /dev/null +++ b/aten/src/ATen/DLConvertor.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/ATen.h" +#include "ATen/dlpack.h" + +// this convertor will: +// 1) take a Tensor object and wrap it in the DLPack tensor +// 2) take a dlpack tensor and convert it to the ATen Tensor + +namespace at { + +AT_API ScalarType toScalarType(const DLDataType& dtype); +AT_API DLManagedTensor * toDLPack(const Tensor& src); +AT_API Tensor fromDLPack(const DLManagedTensor* src); + +} //namespace at diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap new file mode 100644 index 
0000000..760c4a2 --- /dev/null +++ b/aten/src/ATen/Declarations.cwrap @@ -0,0 +1,3737 @@ +[[ + name: storageOffset + python_name: storage_offset + cpu_half: True + device_guard: False + return: long + arguments: + - THTensor* self +]] +[[ + name: nDimension + python_name: ndimension + cpu_half: True + device_guard: False + return: long + arguments: + - THTensor* self +]] +[[ + name: resize_ + return: self + cname: resize + cpu_half: True + arguments: + - THTensor* self + - arg: THSize* size + long_args: True + - CONSTANT NULL +]] +[[ + name: set_ + cname: set + cpu_half: True + device_guard: False + return: argument 0 + options: + - cname: set + scalar_check: source_->isScalar() + arguments: + - THTensor* self + - THTensor* source + - cname: setStorage + scalar_check: False + arguments: + - THTensor* self + - CONSTANT NULL, 0, THLongStorageView({0}, THLongStorageViewKind::SIZE), NULL + - cname: setStorage + scalar_check: False + arguments: + - THTensor* self + - THStorage* source + - CONSTANT 0 + - CONSTANT __storage_size.get() + - CONSTANT NULL + - cname: setStorage + arguments: + - THTensor* self + - THStorage* source + - long storage_offset + - THSize* size + - arg: THStride* stride + default: NULL +]] +[[ + name: _fill_ + return: self + cname: fill + options: + - arguments: + - THTensor* self + - real value + - zero_dim_tensor_only: True + arguments: + - THTensor* self + - THTensor* value +]] +[[ + name: isContiguous + python_name: is_contiguous + cpu_half: True + device_guard: False + return: bool + arguments: + - THTensor* self +]] +[[ + name: isSetTo + python_name: is_set_to + cpu_half: True + device_guard: False + return: bool + arguments: + - THTensor* self + - THTensor* tensor +]] +[[ + name: maskedFill_ + cname: maskedFill + python_name: masked_fill_ + return: self + options: + - arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - real value + - zero_dim_tensor_only: True + arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - THTensor* value +]] +[[ + name: maskedCopy_ + cname: maskedCopy + python_name: masked_scatter_ + return: self + arguments: + - arg: THTensor* self + broadcast: mask inplace fallback types:Byte + - THBoolTensor* mask + - THTensor* source +]] +[[ + name: maskedSelect + python_name: masked_select + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: mask fallback types:Byte + - THBoolTensor* mask +]] +[[ + name: nonzero + variants: + - method + - function + return: argument 0 + arguments: + - arg: THIndexTensor* result + output: True + - THTensor* self +]] +[[ + name: contiguous + cname: newContiguous + return: THTensor* + arguments: + - THTensor* self +]] +[[ + name: th_clone + cname: newClone + return: THTensor* + variants: + - function + cpu_half: True + arguments: + - THTensor* self +]] +[[ + name: view + cname: newView + device_guard: False + return: THTensor* + arguments: + - THTensor* self + - arg: THSize* size + long_args: True +]] +[[ + name: resizeAs_ + python_name: th_resize_as_ + cname: resizeAs + variants: + - function + return: self + scalar_check: the_template_->isScalar() + arguments: + - THTensor* self + - THTensor* the_template +]] +[[ + name: indexSelect + python_name: index_select + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: 
self + - THIndexTensor* index +]] +[[ + name: _indexCopy_ + cname: indexCopy + return: argument 0 + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* source +]] +[[ + name: take + cname: take + variants: + - method + - function + return: argument 0 + scalar_check: index_->isScalar() + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THIndexTensor* index +]] +[[ + name: put_ + cname: put + backends: + - CPU + - CUDA + return: argument 0 + arguments: + - THTensor* self + - THIndexTensor* index + - THTensor* source + - arg: bool accumulate + default: "false" +]] +[[ + name: indexAdd_ + python_name: index_add_ + cname: indexAdd + return: argument 0 + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* source +]] +[[ + name: indexFill_ + python_name: index_fill_ + cname: indexFill + return: argument 0 + options: + - arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - real value + - zero_dim_tensor_only: True + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* value +]] +[[ + name: unfold + cpu_half: True + device_guard: False + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dimension + wrap_dim: self + - long size + - long step +]] +[[ + name: _range + cname: range + variants: + - function + backends: + - CPU + - CUDA + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - accreal start + - accreal end + - arg: accreal step + default: 1 +]] +[[ + name: _arange + variants: + - function + backends: + - CPU + - CUDA + return: argument 0 + options: + - cname: arange + arguments: + - arg: THTensor* result + output: True + - accreal start + - accreal end + - arg: accreal step + default: 1 + - cname: arange + arguments: + - arg: THTensor* result + output: True + - CONSTANT 0 + - accreal end + - CONSTANT 1 +]] +[[ + name: scatter_ + return: argument 0 + options: + - cname: scatter + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* src + - cname: scatterFill + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - real value +]] +[[ + name: scatter_add_ + return: argument 0 + cname: scatterAdd + arguments: + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index + - THTensor* src +]] +[[ + name: gather + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: index + - THTensor* self + - arg: long dim + wrap_dim: self + - THIndexTensor* index +]] +[[ + name: data_ptr + with_gil: True + device_guard: False + return: void* + cpu_half: True + cname: data + arguments: + - THTensor* self +]] +[[ + name: equal + variants: + - method + - function + return: bool + arguments: + - THTensor* self + - THTensor* other +]] +[[ + name: __and__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitand + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitand + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __iand__ + variants: + - method + return: argument 0 + options: + - cname: bitand + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitand + 
arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __or__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitor + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitor + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ior__ + variants: + - method + return: argument 0 + options: + - cname: bitor + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitor + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __xor__ + variants: + - method + - function + return: argument 0 + options: + - cname: bitxor + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cbitxor + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ixor__ + variants: + - method + return: argument 0 + options: + - cname: bitxor + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cbitxor + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __lshift__ + variants: + - method + - function + return: argument 0 + options: + - cname: lshift + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: clshift + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __ilshift__ + variants: + - method + return: argument 0 + options: + - cname: lshift + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: clshift + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: __rshift__ + variants: + - method + - function + return: argument 0 + options: + - cname: rshift + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: crshift + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: __irshift__ + variants: + - method + return: argument 0 + options: + - cname: rshift + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: crshift + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: lt + variants: + - method + - function + return: argument 0 + options: + - cname: ltValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: ltTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: lt_ + return: self + options: + - cname: ltValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: ltTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: THTensor* other +]] +[[ + name: gt + variants: + - method + - function + return: argument 0 + options: + - cname: gtValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: gtTensor + arguments: + - arg: THBoolTensor* result + 
output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: gt_ + return: self + options: + - cname: gtValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: gtTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: le + variants: + - method + - function + return: argument 0 + options: + - cname: leValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: leTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: le_ + return: self + options: + - cname: leValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: leTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: ge + variants: + - method + - function + return: argument 0 + options: + - cname: geValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: geTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: ge_ + return: self + options: + - cname: geValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: geTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: eq + variants: + - method + - function + return: argument 0 + options: + - cname: eqValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: eqTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: eq_ + return: self + options: + - cname: eqValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: eqTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: ne + variants: + - method + - function + return: argument 0 + options: + - cname: neValue + arguments: + - arg: THBoolTensor* result + output: True + - THTensor* self + - real other + - cname: neTensor + arguments: + - arg: THBoolTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: ne_ + return: self + options: + - cname: neValueT + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: neTensorT + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: min + variants: + - method + - function + options: + - cname: minall + return: real + arguments: + - THTensor* self + - cname: cmin + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: _th_min + variants: + - method + - function + options: + - cname: min + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* min + output: True + - arg: THIndexTensor* min_indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: max + variants: + - method + - function + options: + - 
cname: maxall + return: real + arguments: + - THTensor* self + - cname: cmax + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: _th_max + variants: + - method + - function + options: + - cname: max + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* max + output: True + - arg: THIndexTensor* max_indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_kthvalue + backends: + - CPU + variants: + - method + - function + cname: kthvalue + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - long k + - arg: long dim + wrap_dim: self + default: __last_dim + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_mode + variants: + - method + - function + cname: mode + return: argument 0,1 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + default: __last_dim + - arg: bool keepdim + default: "false" +]] +[[ + name: median + variants: + - method + - function + return: argument 0,1 + options: + - cname: medianall + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_median + variants: + - method + - function + cname: median + return: argument 0,1 + options: + - cname: median + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: sort + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - arg: long dim + default: __last_dim + wrap_dim: self + - arg: bool descending + default: "false" +]] +[[ + name: topk + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* values + output: True + - arg: THIndexTensor* indices + output: True + - THTensor* self + - long k + - arg: long dim + default: __last_dim + wrap_dim: self + - arg: bool largest + default: "true" + - arg: bool sorted + default: "true" +]] +[[ + name: all + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAndAll + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_all + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAnd + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: any + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAnyAll + return: real + arguments: + - THTensor* self +]] +[[ + name: _th_any + types: + - Byte + variants: + - method + - function + backends: + - CPU + - CUDA + options: + - cname: logicalAny + return: 
argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: getDevice + python_name: _th_get_device + device_guard: False + variants: + - function + backends: + - CUDA + return: long + arguments: + - THTensor* self +]] +[[ + name: _abs + cname: abs + types: + - floating_point + - Long + - Int + - Short + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_sigmoid + types: + - floating_point + backends: + - CUDA + cname: sigmoid + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log + cname: log + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log10 + cname: log10 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log1p + cname: log1p + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _log2 + cname: log2 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: lgamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: lgamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: lgamma + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: digamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: digamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: digamma + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: polygamma + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - int64_t n + - THTensor* self +]] +[[ + name: polygamma_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: polygamma + return: self + arguments: + - THTensor* self + - int64_t n + - THTensor* self +]] +[[ + name: _exp + cname: exp + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _expm1 + cname: expm1 + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _cos + cname: cos + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* 
self +]] +[[ + name: _acos + cname: acos + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _cosh + cname: cosh + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sin + cname: sin + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _asin + cname: asin + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sinh + cname: sinh + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _tan + cname: tan + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _atan + cname: atan + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_tanh + cname: tanh + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _erf + cname: erf + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _erfc + cname: erfc + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: erfinv_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: erfinv + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: erfinv + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _sqrt + cname: sqrt + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _rsqrt + cname: rsqrt + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _ceil + cname: ceil + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _floor + cname: floor + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _round + cname: round + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + 
arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _trunc + cname: trunc + types: + - floating_point + backends: + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: frac_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: frac + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: frac + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: _th_var + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: varall + return: accreal + arguments: + - THTensor* self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - cname: var + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - arg: bool keepdim + default: "false" +]] +[[ + name: _th_std + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: stdall + return: accreal + arguments: + - THTensor* self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - cname: std + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool unbiased + if_true: 0 + if_false: 1 + default: 0 + - arg: bool keepdim + default: "false" +]] +[[ + name: th_norm + cname: norm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - function + options: + - cname: normall + return: accreal + arguments: + - THTensor* self + - arg: real p + default: AS_REAL(2) +]] +[[ + name: _th_norm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: norm + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: real p + python_default_init: AS_REAL(2) + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: renorm + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real p + - arg: long dim + wrap_dim: self + - real maxnorm +]] +[[ + name: renorm_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: renorm + return: self + arguments: + - THTensor* self + - THTensor* self + - real p + - arg: long dim + wrap_dim: self + - real maxnorm +]] +[[ + name: dist + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: dist + return: accreal + arguments: + - arg: THTensor* self + broadcast: other fallback + - THTensor* other + - arg: real p + default: AS_REAL(2) +]] +[[ + name: reciprocal + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: cinv + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: reciprocal_ + 
types: + - floating_point + backends: + - CPU + - CUDA + options: + - cname: cinv + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: neg + backends: + - CPU + - CUDA + variants: + - method + - function + options: + - cname: neg + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: neg_ + backends: + - CPU + - CUDA + options: + - cname: neg + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: atan2 + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + cname: atan2 + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: atan2_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: atan2 + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other fallback inplace + - THTensor* other +]] +[[ + name: th_pow + cname: pow + variants: + - function + return: argument 0 + options: + - cname: pow + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real exponent +]] +[[ + name: pow + variants: + - method + - function + return: argument 0 + options: + - cname: cpow + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: exponent fallback + - THTensor* exponent +]] +[[ + name: pow + variants: + - function + return: argument 0 + options: + - cname: tpow + arguments: + - arg: THTensor* result + output: True + - real base + - THTensor* self +]] +[[ + name: pow_ + return: argument 0 + cname: pow + options: + - cname: pow + arguments: + - THTensor* self + - THTensor* self + - real exponent + - cname: cpow + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: exponent inplace fallback + - THTensor* exponent +]] +[[ + name: lerp + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + cname: lerp + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: end fallback + - THTensor* end + - real weight +]] +[[ + name: lerp_ + types: + - floating_point + backends: + - CPU + - CUDA + return: self + cname: lerp + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: end fallback inplace + - THTensor* end + - real weight +]] +[[ + name: _linspace + cname: linspace + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - real start + - real end + - arg: long steps + default: 100 +]] +[[ + name: _logspace + cname: logspace + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - real start + - real end + - arg: long steps + default: 100 +]] +[[ + name: histc + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long bins + default: 100 + - arg: real min + default: 0 + - arg: real max + default: 0 +]] +[[ + name: th_zero_ + cname: zero + return: self + variants: + - function + arguments: + - THTensor* self +]] +[[ + name: _sumall + variants: + - method + - function + options: + - cname: sumall + return: accreal + arguments: + - THTensor* self +]] +[[ + name: _th_sum + variants: + - method + - 
function + options: + - cname: sum + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _prodall + variants: + - method + - function + options: + - cname: prodall + return: accreal + arguments: + - THTensor* self +]] +[[ + name: _th_prod + variants: + - method + - function + options: + - cname: prod + return: argument 0 + scalar_check: self_->isScalar() || (keepdim == false && self_->dim() == 1) + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self + - arg: bool keepdim + default: "false" +]] +[[ + name: _cumsum + cname: cumsum + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self +]] +[[ + name: _cumprod + cname: cumprod + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long dim + wrap_dim: self +]] +[[ + name: sign + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self +]] +[[ + name: sign_ + cname: sign + return: self + arguments: + - THTensor* self + - THTensor* self +]] +[[ + name: trace + variants: + - method + - function + return: accreal + arguments: + - THTensor* self +]] +[[ + name: th_add + variants: + - function + return: argument 0 + options: + - cname: cadd + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: add + variants: + - method + - function + return: argument 0 + options: + - cname: add_scaled + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_add_ + return: argument 0 + variants: [function] + options: + - cname: cadd + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: add_ + return: argument 0 + options: + - cname: add_scaled + arguments: + - THTensor* self + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_sub + variants: + - function + return: argument 0 + options: + - cname: csub + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: sub + variants: + - method + - function + return: argument 0 + options: + - cname: sub_scaled + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True +]] +[[ + name: th_sub_ + return: argument 0 + variants: [function] + options: + - cname: csub + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* other +]] +[[ + name: sub_ + return: argument 0 + options: + - cname: sub_scaled + arguments: + - THTensor* self + - THTensor* self + - real other + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True 
+]] +[[ + name: th_mul + variants: + - function + return: argument 0 + options: + - cname: mul + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cmul + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: THTensor* other +]] +[[ + name: th_mul_ + variants: + - function + return: argument 0 + options: + - cname: mul + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cmul + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: th_div + variants: + - function + return: argument 0 + options: + - cname: div + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other +]] +[[ + name: div + variants: + - method + - function + return: argument 0 + options: + - cname: cdiv + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: th_div_ + variants: [function] + return: argument 0 + options: + - cname: div + arguments: + - THTensor* self + - THTensor* self + - real other +]] +[[ + name: div_ + return: argument 0 + options: + - cname: cdiv + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: fmod + return: argument 0 + variants: + - method + - function + options: + - cname: fmod + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cfmod + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - THTensor* other +]] +[[ + name: fmod_ + return: argument 0 + options: + - cname: fmod + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cfmod + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: remainder + return: argument 0 + variants: + - method + - function + options: + - cname: remainder + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real other + - cname: cremainder + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: other fallback + - arg: THTensor* other +]] +[[ + name: remainder_ + return: argument 0 + options: + - cname: remainder + arguments: + - THTensor* self + - THTensor* self + - real other + - cname: cremainder + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: other inplace fallback + - THTensor* other +]] +[[ + name: clamp + cname: clamp + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real min + - real max +]] +[[ + name: clamp_ + cname: clamp + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real min + - real max +]] +[[ + name: clamp_min + cname: cmaxValue + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real min +]] +[[ + name: clamp_min_ + cname: cmaxValue + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real min +]] +[[ + name: clamp_max + cname: cminValue + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - real max +]] +[[ + name: clamp_max_ + cname: 
cminValue + variants: + - method + - function + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - real max +]] +[[ + name: _dot + backend_type_pairs: [[CUDA,floating_point], [CPU,all]] + cname: dot + variants: + - method + - function + return: accreal + arguments: + - arg: THTensor* self + - arg: THTensor* tensor +]] +[[ + name: tril + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: tril_ + cname: tril + return: self + arguments: + - THTensor* self + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: triu + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: triu_ + cname: triu + return: self + arguments: + - THTensor* self + - THTensor* self + - arg: long diagonal + default: 0 +]] +[[ + name: cross + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* other + - arg: long dim + default: -1 +]] +[[ + name: diag + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - arg: long diagonal + default: 0 + aten_custom_call: | + if (self_->isScalar()) { + throw std::runtime_error("Input must be 1-d or 2-d"); + } + ${THTensor}_diag(${state,}result_->tensor, self_->tensor, diagonal); + result_->maybeScalar(self_->isScalar()); +]] +[[ + name: th_addmm + cname: addmm + variants: + - function + return: argument 0 + options: + - arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: mat1,mat2 dims:mat1.dim0,mat2.dim1 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat1 + - THTensor* mat2 +]] +[[ + name: th_addmm_ + variants: [function] + return: self + options: + - cname: addmm + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat1 + - THTensor* mat2 +]] +[[ + name: _addmv + cname: addmv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: mat,vec dims:mat.dim0 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat + - THTensor* vec +]] +[[ + name: _addmv_ + cname: addmv + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* mat + - THTensor* vec +]] +[[ + name: _addr + cname: addr + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: vec1,vec2 dims:vec1.dim0,vec2.dim0 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* vec1 + - THTensor* vec2 +]] +[[ + name: _addr_ + cname: addr + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* vec1 + - THTensor* 
vec2 +]] +[[ + name: _ger + cname: addr + variants: + - method + - function + return: argument 0 + scalar_check: False + arguments: + - arg: THTensor* result + output: True + resize: [ [self,0], [vec2,0] ] + resize_scalar: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* vec2 +]] +[[ + name: _mv + cname: addmv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: [ [self, 0] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* vec +]] +[[ + name: _mm + variants: + - method + - function + return: argument 0 + options: + - cname: addmm + arguments: + - arg: THTensor* result + output: True + resize: [ [self, 0], [mat2,1] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* mat2 +]] +[[ + name: bmm + cname: baddbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + resize: [ [self,0], [self,1], [mat2,2] ] + cpu_zero: True + - CONSTANT AS_REAL(0) + - argument 0 + - CONSTANT AS_REAL(1) + - THTensor* self + - THTensor* mat2 +]] +[[ + name: addbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: batch1,batch2 dims:batch1.dim1,batch2.dim2 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: addbmm_ + cname: addbmm + return: self + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: baddbmm + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - arg: THTensor* self + broadcast: batch1,batch2 dims:batch1.dim0,batch1.dim1,batch2.dim2 + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: baddbmm_ + cname: baddbmm + return: argument 0 + arguments: + - THTensor* self + - arg: real beta + default: AS_REAL(1) + kwarg_only: True + - THTensor* self + - arg: real alpha + default: AS_REAL(1) + kwarg_only: True + - THTensor* batch1 + - THTensor* batch2 +]] +[[ + name: addcmul + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: tensor1,tensor2 fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: addcmul_ + options: + - cname: addcmul + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: tensor1,tensor2 inplace fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 + - cname: spaddcmul + return: argument 0 + arguments: + - THTensor* self + - THTensor* self + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THSTensor* tensor1 + - THSTensor* tensor2 +]] +[[ + name: addcdiv + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - arg: THTensor* self + broadcast: tensor1,tensor2 fallback + - arg: real value + 
default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: addcdiv_ + cname: addcdiv + return: argument 0 + arguments: + - THTensor* self + - arg: THTensor* self + broadcast: tensor1,tensor2 inplace fallback + - arg: real value + default: AS_REAL(1) + kwarg_only: True + - THTensor* tensor1 + - THTensor* tensor2 +]] +[[ + name: _gesv_single + cname: gesv + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* solution + output: True + - arg: THTensor* lu + output: True + - THTensor* self + - THTensor* A +]] +[[ + name: gels + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - THTensor* A +]] +[[ + name: trtrs + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - THTensor* A + - arg: bool upper + if_true: U + if_false: L + default: U + - arg: bool transpose + if_true: T + if_false: N + default: N + - arg: bool unitriangular + if_true: U + if_false: N + default: N +]] +[[ + name: symeig + cname: syev + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - arg: bool eigenvectors + if_true: V + if_false: N + default: N + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: eig + cname: geev + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self + - arg: bool eigenvectors + if_true: V + if_false: N + default: N +]] +[[ + name: svd + cname: gesvd + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1,2 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - arg: THTensor* res3 + output: True + - THTensor* self + - arg: bool some + if_true: S + if_false: A + default: S +]] +[[ + name: _getri + cname: getri + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self +]] +[[ + name: potrf + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: potrs + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: potri + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* output + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U +]] +[[ + name: pstrf + types: + - Float + - 
Double + backends: + - CPU + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THIntegerTensor* res2 + output: True + - THTensor* self + - arg: bool upper + if_true: U + if_false: L + default: U + - arg: real tol + default: -1 + aten_custom_call: | + ${THTensor}_pstrf(res1_->tensor, res2_->tensor, self_->tensor, (upper) ? "U" : "L", tol_); + res2 -= 1; // LAPACK returns 1-indexed pivots +]] +[[ + name: qr + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self +]] +[[ + name: geqrf + types: + - Float + - Double + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* res1 + output: True + - arg: THTensor* res2 + output: True + - THTensor* self +]] +[[ + name: orgqr + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 +]] +[[ + name: ormqr + types: + - Float + - Double + backends: + - CPU + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* input2 + - THTensor* input3 + - arg: bool left + if_true: L + if_false: R + default: L + - arg: bool transpose + if_true: T + if_false: N + default: N +]] +[[ + name: btrifact + cname: btrifact + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1 + arguments: + - arg: THTensor* result + output: True + - arg: THIntegerTensor* pivots + output: True + - CONSTANT NULL + - arg: bool pivot + kwarg_only: True + default: "true" + - THTensor* self +]] +[[ + name: btrifact_with_info + cname: btrifact + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0,1,2 + arguments: + - arg: THTensor* result + output: True + - arg: THIntegerTensor* pivots + output: True + - arg: THIntegerTensor* info + output: True + - arg: bool pivot + kwarg_only: True + default: "true" + - THTensor* self +]] +[[ + name: btrisolve + cname: btrisolve + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THTensor* LU_data + - THIntegerTensor* LU_pivots +]] +[[ + name: random_ + backends: + - CPU + - CUDA + return: self + options: + - cname: random + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - cname: cappedRandom + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - int64_t to + - cname: clampedRandom + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - int64_t from + - int64_t to +]] +[[ + name: multinomial + types: + - floating_point + backends: + - CPU + - CUDA + variants: + - method + - function + return: argument 0 + arguments: + - arg: THIndexTensor* result + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* self + - long num_samples + - arg: bool replacement + default: "false" +]] +[[ + name: uniform_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: uniform + return: self + 
arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double from + default: 0 + - arg: double to + default: 1 +]] +[[ + name: normal + types: + - floating_point + backends: + - CPU + - CUDA + return: argument 0 + variants: + - function + options: + - cname: normal_means + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* mean + - arg: double std + default: 1 + - cname: normal_stddevs + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + - THTensor* std + - cname: normal_means_stddevs + arguments: + - arg: THTensor* output + output: True + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* mean + - THTensor* std +]] +[[ + name: normal_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: normal + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + default: 0 + - arg: double std + default: 1 +]] +[[ + name: cauchy_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: cauchy + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double median + default: 0 + - arg: double sigma + default: 1 +]] +[[ + name: logNormal_ + cname: logNormal + python_name: log_normal_ + types: + - floating_point + backends: + - CPU + - CUDA + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double mean + default: 1 + - arg: double std + default: 2 +]] +[[ + name: exponential_ + types: + - floating_point + backends: + - CPU + - CUDA + cname: exponential + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - arg: double lambd + default: 1 +]] +[[ + name: geometric_ + backends: + - CPU + - CUDA + cname: geometric + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - double p +]] +[[ + name: _bernoulli_ + backends: + - CPU + - CUDA + cname: bernoulli + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - double p +]] +[[ + name: _th_bernoulli + types: + - Float + - Double + return: argument 0 + variants: + - method + - function + cname: bernoulli_Tensor + arguments: + - arg: THTensor* output + output: True + resize: self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - THTensor* self +]] +[[ + name: _dirichlet_grad + types: + - floating_point + backends: + - CPU + return: argument 0 + variants: + - function + options: + - cname: dirichlet_grad + arguments: + - arg: THTensor* output + output: True + - THTensor* x + - THTensor* alpha + - THTensor* total +]] +[[ + name: th_tensor + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: new + arguments: [] + - cname: newWithSize + arguments: + - THSize* size + - CONSTANT NULL +]] +[[ + name: tensor + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: newWithSize + arguments: + - THSize* size + - arg: THStride* stride + - cname: newWithStorage + arguments: + - THStorage* storage + - int64_t storageOffset + - THSize* size + - arg: THStride* stride + default: NULL +]] + +# In theory, this 
could be a part of the above declaration. But in +# practice this leads to all sorts of problems with ambiguous overloads. +# So we add it here with a separate name. +[[ + name: alias + return: THTensor* + cpu_half: True + variants: [function] + options: + - cname: newWithTensor + arguments: + - THTensor* self +]] +[[ + name: _copy_ignoring_overlaps_ + cname: copyIgnoringOverlaps + return: self + backends: + - CUDA + arguments: + - THTensor* self + - THTensor* src +]] + +[[ + name: as_strided + variants: [method,function] + cpu_half: True + return: argument 0 + arguments: + - arg: THTensor* result + output: True + - THTensor* self + - THSize* size + - THStride* stride + - arg: int64_t storage_offset + aten_custom_call: | + ${THTensor}_setStorage(${state,}result_->tensor, self_->tensor->storage, storage_offset, size_, stride_); + result_->maybeScalar(size.size() == 0); +]] + +[[ + name: as_strided_ + variants: [method] + cpu_half: True + return: argument 0 + arguments: + - THTensor* self + - THSize* size + - THStride* stride + - arg: int64_t storage_offset + aten_custom_call: | + ${THTensor}_setStorage(${state,}self_->tensor, self_->tensor->storage, storage_offset, size_, stride_); + self_->maybeScalar(size.size() == 0); +]] + +[[ + name: _cat + cname: catArray + variants: [function] + return: self + arguments: + - arg: THTensor* self + output: True + - TensorList tensors + - arg: int64_t dim + default: 0 +]] diff --git a/aten/src/ATen/Deprecated.h b/aten/src/ATen/Deprecated.h new file mode 100644 index 0000000..6e136ed --- /dev/null +++ b/aten/src/ATen/Deprecated.h @@ -0,0 +1,16 @@ +#pragma once + +// Largely from https://stackoverflow.com/questions/295120/c-mark-as-deprecated + +#if defined(__cplusplus) && __cplusplus > 201402L +#define AT_DEPRECATED(function) [[deprecated]] function +#else +#if defined(__GNUC__) +#define AT_DEPRECATED(function) __attribute__((deprecated)) function +#elif defined(_MSC_VER) +#define AT_DEPRECATED(function) __declspec(deprecated) function +#else +#warning "You need to implement AT_DEPRECATED for this compiler" +#define AT_DEPRECATED(function) function +#endif // defined(__GNUC__) +#endif // defined(__cplusplus) && __cplusplus > 201402L diff --git a/aten/src/ATen/Device.cpp b/aten/src/ATen/Device.cpp new file mode 100644 index 0000000..14ad860 --- /dev/null +++ b/aten/src/ATen/Device.cpp @@ -0,0 +1,100 @@ +#include + +#include + +#include +#include +#include +#include + +namespace at { +namespace { +std::pair parse_type(const std::string& device_string) { + auto position = device_string.find("cpu"); + if (position != std::string::npos) { + return {Device::Type::CPU, 3}; + } + position = device_string.find("cuda"); + if (position != std::string::npos) { + return {Device::Type::CUDA, 4}; + } + AT_ERROR("Expected 'cpu' or 'cuda' device type at start of device string"); +} +} // namespace + +// `std::regex` is still in a very incomplete state in GCC 4.8.x, +// so we have to do our own parsing, like peasants. 
+// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions +// +// Replace with the following code once we shed our GCC skin: +// +// static const std::regex regex( +// "(cuda|cpu)|(cuda|cpu):([0-9]+)|([0-9]+)", +// std::regex_constants::basic); +// std::smatch match; +// const bool ok = std::regex_match(device_string, match, regex); +// AT_CHECK(ok, "Invalid device string: '", device_string, "'"); +// if (match[1].matched) { +// type_ = parse_type_from_string(match[1].str()); +// } else { +// if (match[2].matched) { +// type_ = parse_type_from_string(match[1].str()); +// } else { +// type_ = Type::CUDA; +// } +// AT_ASSERT(match[3].matched); +// index_ = std::stoi(match[3].str()); +// } +Device::Device(const std::string& device_string) : Device(Type::CPU) { + AT_CHECK(!device_string.empty(), "Device string must not be empty"); + + size_t position; + std::tie(type_, position) = parse_type(device_string); + + // e.g. 'cuda', 'cpu'. + if (position == device_string.size()) { + return; + } + + AT_CHECK( + device_string[position] == ':', + "Expected ':' to separate device type from index in device string"); + // Skip the colon. + position += 1; + + const auto index_string = device_string.substr(position); + try { + index_ = std::stoi(index_string); + } catch (const std::exception&) { + AT_ERROR( + "Could not parse device index '", + index_string, + "' in device string '", + device_string, + "'"); + } +} + +} // namespace at + +std::ostream& operator<<(std::ostream& stream, at::Device::Type type) { + switch (type) { + case at::Device::Type::CPU: { + stream << "cpu"; + break; + } + case at::Device::Type::CUDA: { + stream << "cuda"; + break; + } + } + return stream; +} + +std::ostream& operator<<(std::ostream& stream, const at::Device& device) { + stream << device.type(); + if (device.has_index()) { + stream << ":" << device.index(); + } + return stream; +} diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h new file mode 100644 index 0000000..4795b77 --- /dev/null +++ b/aten/src/ATen/Device.h @@ -0,0 +1,128 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace at { +/// Represents a a compute device on which a tensor is located. A device is +/// uniquely identified by a type, which specifies the type of machine it is +/// (e.g. CPU or CUDA GPU), and a device index or ordinal, which identifies the +/// specific compute device when there is more than one of a certain type. The +/// device index is optional, and in its defaulted state represents (abstractly) +/// "the current device". Further, there are two constraints on the value of the +/// device index, if one is explicitly stored: +/// 1. A negative index represents the current device, a non-negative index +/// represents a specific, concrete device, +/// 2. When the device type is CPU, the device index must be zero. +struct Device { + /// The possible values of the device *type*. + enum class Type { CPU, CUDA }; + + /// Converts a `Backend` to a `Device::Type` if possible. + static Type backend_to_type(Backend backend) { + switch (backend) { + case kCPU: + case kSparseCPU: + return Type::CPU; + case kCUDA: + case kSparseCUDA: + return Type::CUDA; + default: + AT_ERROR( + "Invalid backend ", toString(backend), " for Device construction"); + } + } + + /// Constructs a new `Device` from a `Type` and an optional device index. 
+ /* implicit */ Device(Type type, int32_t index = -1) + : type_(type), index_(index) { + AT_CHECK( + index == -1 || index >= 0, + "Device index must be -1 or non-negative, got ", + index); + AT_CHECK( + !is_cpu() || index <= 0, + "CPU device index must be -1 or zero, got ", + index); + } + + /// Constructs a `Device` from a string description, for convenience. + /// The string supplied must follow the following schema: + /// `(cpu|cuda):[]` + /// where `cpu:` or `cuda:` specifies the device type, and + /// `` optionally specifies a device index. + /* implicit */ Device(const std::string& device_string); + + /// Constructs a new `Device` from a `Backend` (which is converted to a + /// `Type`, if possible) and an optional device index. + /* implicit */ Device(Backend backend, int32_t index = -1) + : Device(backend_to_type(backend), index) {} + + /// Returns true if the type and index of this `Device` matches that of + /// `other`. + bool operator==(const Device& other) const noexcept { + return this->type_ == other.type_ && this->index_ == other.index_; + } + + /// Returns true if the type or index of this `Device` differs from that of + /// `other`. + bool operator!=(const Device& other) const noexcept { + return !(*this == other); + } + + /// Sets the device index. + void set_index(int32_t index) { + index_ = index; + } + + /// Returns the type of device this is. + Type type() const noexcept { + return type_; + } + + /// Returns the optional index. + const int32_t& index() const noexcept { + return index_; + } + + /// Returns true if the device has a non-default index. + bool has_index() const noexcept { + return index_ != -1; + } + + /// Return true if the device is of CUDA type. + bool is_cuda() const noexcept { + return type_ == Type::CUDA; + } + + /// Return true if the device is of CPU type. + bool is_cpu() const noexcept { + return type_ == Type::CPU; + } + + private: + Type type_; + int32_t index_ = -1; +}; +} // namespace at + +AT_API std::ostream& operator<<(std::ostream& stream, at::Device::Type type); +AT_API std::ostream& operator<<(std::ostream& stream, const at::Device& device); + +namespace std { + template<> struct hash + { + size_t operator()(const at::Device& device) const noexcept { + size_t hash_val = static_cast(device.index() + 1); + if (device.is_cuda()) { + hash_val += 2; + } + return hash_val; + } + }; +} // namespace std diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h new file mode 100644 index 0000000..6a3b84d --- /dev/null +++ b/aten/src/ATen/DeviceGuard.h @@ -0,0 +1,102 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at { +/// RAII guard that sets a certain default GPU index in its constructor, and +/// changes it back to the device that was originally active upon destruction. +/// +/// The index is always reset to the one that was active at the time of +/// construction of the guard. Even if you `set_index` after construction, the +/// destructor will still reset the index to the one that was active at +/// construction time. +struct DeviceGuard { + /// Default constructor, does nothing. + DeviceGuard() = default; + + /// Uses the given device's `index()` if it is a CUDA device, else does + /// nothing. + explicit DeviceGuard(Device device) { + if (device.is_cuda()) { + set_index(device.index()); + } + } + + /// Calls `set_device` with the given index. + explicit DeviceGuard(int32_t index) { + set_index(index); + } + + /// Sets the device to the index on which the given tensor is located. 
+ explicit DeviceGuard(const Tensor& tensor) { + set_index_from(tensor); + } + + /// Sets the device to the index on which the first tensor in the list is + /// located. If the list is empty, does nothing. + explicit DeviceGuard(const TensorList& tensors) { + if (!tensors.empty()) { + set_index_from(tensors.front()); + } + } + + /// Resets the device to the index that was active at construction of the + /// guard. + ~DeviceGuard() { + // It should only not have a value if an index was never actually set. + if (original_index_ != -1) { + // Unchecked because we don't want to throw in the destructor. + detail::DynamicCUDAInterface::unchecked_set_device(original_index_); + } + } + + /// Sets the device to the given one. + void set_index(int32_t index) { + if (index == -1) { + return; + } + AT_ASSERT(index >= 0); + if (original_index_ == -1) { + int32_t previous_index = -123; + detail::DynamicCUDAInterface::get_device(&previous_index); + original_index_ = previous_index; + if (index != original_index_) { + detail::DynamicCUDAInterface::set_device(index); + } + } else { + detail::DynamicCUDAInterface::set_device(index); + } + last_index_ = index; + } + + /// Calls `set_index` with the `Tensor`'s current device, if it is a CUDA + /// tensor. Does nothing if the `tensor` is not defined. + void set_index_from(const Tensor& tensor) { + if (tensor.defined() && tensor.is_cuda()) { + set_index(tensor.get_device()); + } + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_index() const noexcept { + return original_index_; + } + + // /// Returns the last device that was set via `set_device`, if any. + int32_t last_index() const noexcept { + return last_index_; + } + + private: + /// The original device that was active at construction of this object. + int32_t original_index_ = -1; + /// The last index that was set via `set_device`. + int32_t last_index_ = -1; +}; +} // namespace at diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h new file mode 100644 index 0000000..aaa4dc9 --- /dev/null +++ b/aten/src/ATen/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include "SmallVector.h" +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h new file mode 100644 index 0000000..6cd8722 --- /dev/null +++ b/aten/src/ATen/Dispatch.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include + +#define AT_PRIVATE_CASE_TYPE(enum_type, type, ...) \ + case enum_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/Error.cpp b/aten/src/ATen/Error.cpp new file mode 100644 index 0000000..1261fbe --- /dev/null +++ b/aten/src/ATen/Error.cpp @@ -0,0 +1,32 @@ +#include +#include + +#include +#include + +namespace at { +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { + out << loc.function << " at " << loc.file << ":" << loc.line; + return out; +} + +Error::Error(SourceLocation source_location, std::string err) + : what_without_backtrace_(err) + , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) + {} + +void Warning::warn(SourceLocation source_location, std::string msg) { + warning_handler_(source_location, msg.c_str()); +} + +void Warning::set_warning_handler(handler_t handler) { + warning_handler_ = handler; +} + +void Warning::print_warning(const SourceLocation& source_location, const char* msg) { + std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; +} + +Warning::handler_t Warning::warning_handler_ = &Warning::print_warning; + +} // namespace at diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h new file mode 100644 index 0000000..5a41eb7 --- /dev/null +++ b/aten/src/ATen/Error.h @@ -0,0 +1,131 @@ +#pragma once + +#include // for AT_API +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { return ss; } + +template +inline 
std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& +_str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + +public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. + const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_API Warning { + using handler_t = void(*)(const SourceLocation& source_location, const char* msg); + +public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning(const SourceLocation& source_location, const char* msg); + +private: + static handler_t warning_handler_; +}; + + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) 
\ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp new file mode 100644 index 0000000..f4a0ce4 --- /dev/null +++ b/aten/src/ATen/ExpandUtils.cpp @@ -0,0 +1,80 @@ +#include "ATen/ExpandUtils.h" + +namespace at { + +std::vector infer_size(IntList a, IntList b) { + auto dimsA = a.size(); + auto dimsB = b.size(); + ptrdiff_t ndim = dimsA > dimsB ? dimsA : dimsB; + std::vector expandedSizes(ndim); + + for (long i = ndim - 1; i >= 0; --i) { + long offset = ndim - 1 - i; + long dimA = dimsA - 1 - offset; + long dimB = dimsB - 1 - offset; + long sizeA = (dimA >= 0) ? a[dimA] : 1; + long sizeB = (dimB >= 0) ? b[dimB] : 1; + + AT_CHECK( + sizeA == sizeB || sizeA == 1 || sizeB == 1, + "The size of tensor a (", sizeA, + ") must match the size of tensor b (", sizeB, + ") at non-singleton dimension ", i); + + // 1s map to the other size (even 0). + expandedSizes[i] = sizeA == 1 ? sizeB : sizeA; + } + + return expandedSizes; +} + +std::tuple, std::vector> inferExpandGeometry( + const Tensor& tensor, + IntList sizes) { + int64_t ndim = sizes.size(); + + if (tensor.dim() == 0) { + std::vector expandedStrides(ndim, 0); + return std::tuple, std::vector>( + sizes.vec(), expandedStrides); + } + std::vector expandedSizes(ndim); + std::vector expandedStrides(ndim); + + // create a new geometry for the tensors + for (int64_t i = ndim - 1; i >= 0; --i) { + int64_t offset = ndim - 1 - i; + int64_t dim = tensor.dim() - 1 - offset; + int64_t size = (dim >= 0) ? tensor.sizes()[dim] : 1; + int64_t stride = (dim >= 0) ? tensor.strides()[dim] + : expandedSizes[i + 1] * expandedStrides[i + 1]; + int64_t targetSize = sizes[i]; + if (targetSize == -1) { + AT_CHECK( + dim >= 0, + "The expanded size of the tensor (", + targetSize, + ") isn't allowed in a leading, non-existing dimension ", + i); + targetSize = size; + } + if (size != targetSize) { + AT_CHECK( + size == 1, + "The expanded size of the tensor (", + targetSize, + ") must match the existing size (", + size, + ") at non-singleton dimension ", + i); + size = targetSize; + stride = 0; + } + expandedSizes[i] = size; + expandedStrides[i] = stride; + } + return std::tuple, std::vector>( + expandedSizes, expandedStrides); +} + +} // namespace at diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h new file mode 100644 index 0000000..2080e56 --- /dev/null +++ b/aten/src/ATen/ExpandUtils.h @@ -0,0 +1,133 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/Error.h" + +#include +#include +#include + +namespace at { + +AT_API std::vector infer_size(IntList a, IntList b); +std::tuple, std::vector > inferExpandGeometry(const Tensor &tensor, IntList sizes); + +// avoid copy-construction of Tensor by using a reference_wrapper. +inline void check_defined(std::initializer_list> tensors, const char *api_name) { + for (auto& t : tensors) { + if (!t.get().defined()) { + AT_ERROR(api_name, "(...) 
called with an undefined Tensor"); + } + } +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand) { + if (tensor.sizes().equals(to_expand.sizes())) { + return std::make_tuple(to_expand); + } + + return std::make_tuple(to_expand.expand(tensor.sizes(), /*implicit=*/true)); // see [expand implicit] +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand, const char *api_name) { + check_defined({tensor, to_expand}, api_name); + return expand_inplace(tensor, to_expand); +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2) { + if (tensor.sizes().equals(to_expand1.sizes()) && tensor.sizes().equals((to_expand2.sizes()))) { + return std::make_tuple(to_expand1, to_expand2); + } + + return std::make_tuple( + to_expand1.expand(tensor.sizes(), /*implicit=*/true), // see [expand implicit] + to_expand2.expand(tensor.sizes(), /*implicit=*/true)); +} + +inline std::tuple expand_inplace(const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2, + const char *api_name) { + check_defined({tensor, to_expand1, to_expand2}, api_name); + return expand_inplace(tensor, to_expand1, to_expand2); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, const Tensor &to_expand2) { + if (to_expand1.sizes().equals(to_expand2.sizes())) { + return std::make_tuple(to_expand1, to_expand2); + } + + auto expanded_size = infer_size(to_expand1.sizes(), to_expand2.sizes()); + return std::make_tuple( + to_expand1.expand(expanded_size, /*implicit=*/true), // see [expand implicit] + to_expand2.expand(expanded_size, /*implicit=*/true)); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) { + check_defined({to_expand1, to_expand2}, api_name); + return expand_outplace(to_expand1, to_expand2); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, + const Tensor &to_expand2, + const Tensor &to_expand3) { + if (to_expand1.sizes().equals(to_expand2.sizes()) && to_expand1.sizes().equals(to_expand3.sizes())) { + return std::make_tuple(to_expand1, to_expand2, to_expand3); + } + + auto expanded_size12 = infer_size(to_expand1.sizes(), to_expand2.sizes()); + auto expanded_size = infer_size(expanded_size12, to_expand3.sizes()); + return std::make_tuple( + to_expand1.expand(expanded_size, /*implicit=*/true), // see [expand implicit] + to_expand2.expand(expanded_size, /*implicit=*/true), + to_expand3.expand(expanded_size, /*implicit=*/true)); +} + +inline std::tuple expand_outplace(const Tensor &to_expand1, + const Tensor &to_expand2, + const Tensor &to_expand3, + const char *api_name) { + check_defined({to_expand1, to_expand2, to_expand3}, api_name); + return expand_outplace(to_expand1, to_expand2, to_expand3); +} + +inline std::tuple expand_size(const Tensor &to_expand, IntList sizes) { + if(to_expand.sizes().equals(sizes)) { + return std::make_tuple(to_expand); + } + + return std::make_tuple(to_expand.expand(sizes, /*implicit=*/true)); // see [expand implicit] +} + +inline std::tuple expand_size(const Tensor &to_expand, IntList sizes, const char *api_name) { + check_defined({to_expand}, api_name); + return expand_size(to_expand, sizes); +} + +inline std::vector expand_outplace(TensorList to_expand) { + // expands a list of Tensors; ignores undefined (null) tensors + bool first = true; + std::vector sizes; + for (size_t i = 0; i < to_expand.size(); ++i) { + if (!to_expand[i].defined()) { + continue; + } else if (first) { + sizes = 
to_expand[i].sizes(); + first = false; + } else { + sizes = infer_size(sizes, to_expand[i].sizes()); + } + } + + std::vector result(to_expand.size()); + for (size_t i = 0; i < to_expand.size(); ++i) { + if (!to_expand[i].defined()) { + continue; + } else if (to_expand[i].sizes().equals(sizes)) { + result[i] = to_expand[i]; + } else { + result[i] = to_expand[i].expand(sizes, /*implicit=*/true); // see [expand implicit] + } + } + return result; +} + +} diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp new file mode 100644 index 0000000..aab224f --- /dev/null +++ b/aten/src/ATen/Formatting.cpp @@ -0,0 +1,295 @@ +#include "ATen/Formatting.h" +#include "ATen/Tensor.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" + +#include +#include +#include + + +namespace at { + +//not all C++ compilers have default float so we define our own here +inline std::ios_base& defaultfloat(std::ios_base& __base) { + __base.unsetf(std::ios_base::floatfield); + return __base; +} +//saves/restores number formatting inside scope +struct FormatGuard { + FormatGuard(std::ostream & out) + : out(out), saved(nullptr) { + saved.copyfmt(out); + } + ~FormatGuard() { + out.copyfmt(saved); + } +private: + std::ostream & out; + std::ios saved; +}; + +std::ostream& operator<<(std::ostream & out, IntList list) { + int i = 0; + out << "["; + for(auto e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << "]"; + return out; +} + +std::ostream& operator<<(std::ostream & out, Backend b) { + return out << toString(b); +} + +std::ostream& operator<<(std::ostream & out, ScalarType t) { + return out << toString(t); +} + +std::ostream& operator<<(std::ostream & out, const Type& t) { + return out << t.toString(); +} + +static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { + auto size = self.numel(); + if(size == 0) { + return std::make_tuple(1., 0); + } + bool intMode = true; + auto self_p = self.data(); + for(int64_t i = 0; i < size; i++) { + auto z = self_p[i]; + if(std::isfinite(z)) { + if(z != ceil(z)) { + intMode = false; + break; + } + } + } + int64_t offset = 0; + while(!std::isfinite(self_p[offset])) { + offset = offset + 1; + if(offset == size) { + break; + } + } + double expMin; + double expMax; + if(offset == size) { + expMin = 1; + expMax = 1; + } else { + expMin = fabs(self_p[offset]); + expMax = fabs(self_p[offset]); + for(int64_t i = offset; i < size; i++) { + double z = fabs(self_p[i]); + if(std::isfinite(z)) { + if(z < expMin) { + expMin = z; + } + if(self_p[i] > expMax) { + expMax = z; + } + } + } + if(expMin != 0) { + expMin = floor(log10(expMin)) + 1; + } else { + expMin = 1; + } + if(expMax != 0) { + expMax = floor(log10(expMax)) + 1; + } else { + expMax = 1; + } + } + double scale = 1; + int64_t sz; + if(intMode) { + if(expMax > 9) { + sz = 11; + stream << std::scientific << std::setprecision(4); + } else { + sz = expMax + 1; + stream << defaultfloat; + } + } else { + if(expMax-expMin > 4) { + sz = 11; + if(fabs(expMax) > 99 || fabs(expMin) > 99) { + sz = sz + 1; + } + stream << std::scientific << std::setprecision(4); + } else { + if(expMax > 5 || expMax < 0) { + sz = 7; + scale = pow(10, expMax-1); + stream << std::fixed << std::setprecision(4); + } else { + if(expMax == 0) { + sz = 7; + } else { + sz = expMax+6; + } + stream << std::fixed << std::setprecision(4); + } + } + } + return std::make_tuple(scale, sz); +} + +static void __printIndent(std::ostream &stream, int64_t indent) +{ + for(int64_t i = 0; i < indent; i++) { + stream << " "; + } +} 
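// [Editor's note -- illustrative, not part of the original commit.]
// __printFormat above scans the tensor once and returns a (scale, width) pair
// that the matrix/tensor printers further down use for column alignment.
// Worked example for the values {0.001, 1000.0}: intMode is false (0.001 is
// not integral), expMin = floor(log10(0.001)) + 1 = -2 and
// expMax = floor(log10(1000)) + 1 = 4, so expMax - expMin = 6 > 4 and the
// stream is switched to scientific notation with field width 11 and
// precision 4, while scale stays 1.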
+ +static void printScale(std::ostream & stream, double scale) { + FormatGuard guard(stream); + stream << defaultfloat << scale << " *" << std::endl; +} +static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) +{ + double scale; + int64_t sz; + std::tie(scale, sz) = __printFormat(stream, self); + + __printIndent(stream, indent); + int64_t nColumnPerLine = (linesize-indent)/(sz+1); + int64_t firstColumn = 0; + int64_t lastColumn = -1; + while(firstColumn < self.size(1)) { + if(firstColumn + nColumnPerLine <= self.size(1)) { + lastColumn = firstColumn + nColumnPerLine - 1; + } else { + lastColumn = self.size(1) - 1; + } + if(nColumnPerLine < self.size(1)) { + if(firstColumn != 0) { + stream << std::endl; + } + stream << "Columns " << firstColumn+1 << " to " << lastColumn+1; + __printIndent(stream, indent); + } + if(scale != 1) { + printScale(stream,scale); + __printIndent(stream, indent); + } + for(int64_t l = 0; l < self.size(0); l++) { + Tensor row = self.select(0,l); + double *row_ptr = row.data(); + for(int64_t c = firstColumn; c < lastColumn+1; c++) { + stream << std::setw(sz) << row_ptr[c]/scale; + if(c == lastColumn) { + stream << std::endl; + if(l != self.size(0)-1) { + if(scale != 1) { + __printIndent(stream, indent); + stream << " "; + } else { + __printIndent(stream, indent); + } + } + } else { + stream << " "; + } + } + } + firstColumn = lastColumn + 1; + } +} + +void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) +{ + std::vector counter(self.ndimension()-2); + bool start = true; + bool finished = false; + counter[0] = -1; + for(size_t i = 1; i < counter.size(); i++) + counter[i] = 0; + while(true) { + for(int64_t i = 0; self.ndimension()-2; i++) { + counter[i] = counter[i] + 1; + if(counter[i] >= self.size(i)) { + if(i == self.ndimension()-3) { + finished = true; + break; + } + counter[i] = 0; + } else { + break; + } + } + if(finished) { + break; + } + if(start) { + start = false; + } else { + stream << std::endl; + } + stream << "("; + Tensor tensor = self; + for(int64_t i=0; i < self.ndimension()-2; i++) { + tensor = tensor.select(0, counter[i]); + stream << counter[i]+1 << ","; + } + stream << ".,.) 
= " << std::endl; + __printMatrix(stream, tensor, linesize, 1); + } +} + +std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize) { + FormatGuard guard(stream); + if(!tensor_.defined()) { + stream << "[ Tensor (undefined) ]"; + } else if (tensor_.is_sparse()) { + stream << "[ " << tensor_.pImpl->toString() << "{}\n"; + stream << "indices:\n" << tensor_._indices() << "\n"; + stream << "values:\n" << tensor_._values() << "\n"; + stream << "size:\n" << tensor_.sizes() << "\n"; + stream << "]"; + } else { + Type& cpudouble = tensor_.type().toBackend(kCPU).toScalarType(kDouble); + Tensor tensor = tensor_.toType(cpudouble).contiguous(); + if(tensor.ndimension() == 0) { + stream << defaultfloat << tensor.data()[0] << std::endl; + stream << "[ " << tensor_.pImpl->toString() << "{} ]"; + } else if(tensor.ndimension() == 1) { + if (tensor.numel() > 0) { + double scale; + int64_t sz; + std::tie(scale, sz) = __printFormat(stream, tensor); + if(scale != 1) { + printScale(stream, scale); + } + double* tensor_p = tensor.data(); + for(int64_t i = 0; i < tensor.size(0); i++) { + stream << std::setw(sz) << tensor_p[i]/scale << std::endl; + } + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0) << "} ]"; + } else if(tensor.ndimension() == 2) { + if (tensor.numel() > 0) { + __printMatrix(stream, tensor, linesize, 0); + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0) << "," << tensor.size(1) << "} ]"; + } else { + if (tensor.numel() > 0) { + __printTensor(stream, tensor, linesize); + } + stream << "[ " << tensor_.pImpl->toString() << "{" << tensor.size(0); + for(int64_t i = 1; i < tensor.ndimension(); i++) { + stream << "," << tensor.size(i); + } + stream << "} ]"; + } + } + return stream; +} + +} diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h new file mode 100644 index 0000000..fe496a1 --- /dev/null +++ b/aten/src/ATen/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "ATen/Type.h" +#include "ATen/Scalar.h" + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, ScalarType t); +AT_API std::ostream& operator<<(std::ostream & out, const Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + s = s.local(); + return out << (s.isFloatingPoint() ? 
s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/Generator.h b/aten/src/ATen/Generator.h new file mode 100644 index 0000000..7e2b68b --- /dev/null +++ b/aten/src/ATen/Generator.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at { + +struct Generator { + Generator() {}; + Generator(const Generator& other) = delete; + Generator(Generator&& other) = delete; + virtual ~Generator() {}; + + virtual Generator& copy(const Generator& other) = 0; + virtual Generator& free() = 0; + + virtual uint64_t seed() = 0; + virtual uint64_t initialSeed() = 0; + virtual Generator& manualSeed(uint64_t seed) = 0; + virtual Generator& manualSeedAll(uint64_t seed) = 0; + virtual void * unsafeGetTH() = 0; +}; + +} // namespace at diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h new file mode 100644 index 0000000..e5563fa --- /dev/null +++ b/aten/src/ATen/Half-inl.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } +inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } +inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } +inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } + +inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } +inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } +inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } +inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= 
(float)b; } +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { return (double)a + b; } +inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } +inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } +inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } + +inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } +inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } +inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } +inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } +inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } +inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } +inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } +inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } +inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } +inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. + +} // namespace at + +namespace std { + +template<> class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = numeric_limits::tinyness_before; + static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } + static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } + static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } + static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } + static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } + static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } + static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } + static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } + static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } 
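  // [Editor's note -- descriptive comments only, not part of the original commit.]
  // The raw bit patterns above decode to the usual IEEE 754 binary16 values:
  // min() = 0x0400 = 2^-14 (smallest normal), max() = 0x7BFF = 65504,
  // lowest() = 0xFBFF = -65504, epsilon() = 0x1400 = 2^-10,
  // round_error() = 0x3800 = 0.5, infinity() = 0x7C00, and
  // denorm_min() = 0x0001 = 2^-24 (smallest subnormal).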
+}; + +} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp new file mode 100644 index 0000000..68f80a5 --- /dev/null +++ b/aten/src/ATen/Half.cpp @@ -0,0 +1,34 @@ +#include "ATen/Half.h" + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +#include +#include + +namespace at { + +static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); + +namespace detail { + +float halfbits2float(unsigned short bits) { + float value; + TH_halfbits2float(&bits, &value); + return value; +} + +unsigned short float2halfbits(float value) { + unsigned short bits; + TH_float2halfbits(&value, &bits); + return bits; +} + +} // namespace detail + +std::ostream& operator<<(std::ostream & out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h new file mode 100644 index 0000000..d740008 --- /dev/null +++ b/aten/src/ATen/Half.h @@ -0,0 +1,113 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include "ATen/ATenGeneral.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifndef AT_HOSTDEVICE + #ifdef __CUDACC__ + #define AT_HOSTDEVICE __host__ __device__ + #else + #define AT_HOSTDEVICE + #endif +#endif + +namespace at { + +namespace detail { + +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); + +} + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + return f < limit::lowest() || f > limit::max(); +} + +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::string msg = "value cannot be converted to type "; + msg += name; + msg += " without overflow: "; + msg += std::to_string(f); + throw std::domain_error(std::move(msg)); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + 
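// [Editor's note -- usage sketch, not part of the original commit.]
// checked_convert above refuses conversions that fall outside the target
// type's finite range, for example:
//
//   at::Half h = at::checked_convert<at::Half>(100000.0f, "Half");
//   // throws std::domain_error: 100000 exceeds numeric_limits<at::Half>::max() (65504)
//
//   bool b = at::overflows<at::Half>(std::numeric_limits<float>::infinity());
//   // false: infinities pass through because at::Half reports has_infinity
//
// For integral targets the isinf/isnan checks are skipped and only the
// lowest()/max() range test applies.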
+AT_API std::ostream& operator<<(std::ostream & out, const Half& value); + +} // namespace at + +#include "Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/Layout.h b/aten/src/ATen/Layout.h new file mode 100644 index 0000000..010248a --- /dev/null +++ b/aten/src/ATen/Layout.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace at { +enum class Layout { Strided, Sparse }; + +constexpr auto kStrided = Layout::Strided; +constexpr auto kSparse = Layout::Sparse; + +inline Layout layout_from_backend(Backend backend) { + switch (backend) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + return Layout::Sparse; + default: + return Layout::Strided; + } +} +} // namespace at diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h new file mode 100644 index 0000000..b3e992a --- /dev/null +++ b/aten/src/ATen/MatrixRef.h @@ -0,0 +1,100 @@ +#pragma once +#include +#include + +#include + +namespace at { + /// MatrixRef - Like an ArrayRef, but with an extra recorded strides so that + /// we can easily view it as a multidimensional array. + /// + /// Like ArrayRef, this class does not own the underlying data, it is expected + /// to be used in situations where the data resides in some other buffer. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + /// + /// For now, 2D only (so the copies are actually cheap, without having + /// to write a SmallVector class) and contiguous only (so we can + /// return non-strided ArrayRef on index). + /// + /// P.S. dimension 0 indexes rows, dimension 1 indexes columns + template + class MatrixRef { + public: + typedef size_t size_type; + + private: + /// Underlying ArrayRef + ArrayRef arr; + + /// Stride of dim 0 (outer dimension) + size_type stride0; + + // Stride of dim 1 is assumed to be 1 + + public: + /// Construct an empty Matrixref. + /*implicit*/ MatrixRef() : arr(nullptr), stride0(0) {} + + /// Construct an MatrixRef from an ArrayRef and outer stride. + /*implicit*/ MatrixRef(ArrayRef arr, size_type stride0) + : arr(arr), stride0(stride0) { + AT_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) + } + + /// @} + /// @name Simple Operations + /// @{ + + /// empty - Check if the matrix is empty. + bool empty() const { return arr.empty(); } + + const T *data() const { return arr.data(); } + + /// size - Get size a dimension + size_t size(size_t dim) const { + if (dim == 0) { + return arr.size() / stride0; + } else if (dim == 1) { + return stride0; + } else { + AT_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); + } + } + + size_t numel() const { + return arr.size(); + } + + /// equals - Check for element-wise equality. + bool equals(MatrixRef RHS) const { + return stride0 == RHS.stride0 && arr.equals(RHS.arr); + } + + /// @} + /// @name Operator Overloads + /// @{ + ArrayRef operator[](size_t Index) const { + return arr.slice(Index*stride0, stride0); + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, MatrixRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
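The deleted assignment operators continue below; as a usage sketch for the accessors above, MatrixRef is a non-owning, row-major 2-D view over a contiguous buffer, and operator[] hands back a per-row ArrayRef. This assumes an ATen build; the include path is an assumption for the sketch.

```cpp
// Hypothetical usage of at::MatrixRef as a 2-D, non-owning view.
// Assumes an ATen build; include path assumed.
#include <ATen/MatrixRef.h>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> data = {1, 2, 3,
                             4, 5, 6};                       // 2 rows x 3 columns, contiguous
  at::MatrixRef<float> m(at::ArrayRef<float>(data), /*stride0=*/3);

  std::cout << m.size(0) << "x" << m.size(1) << "\n";        // 2x3
  at::ArrayRef<float> row1 = m[1];                           // view of {4, 5, 6}, no copy
  std::cout << row1[2] << "\n";                              // 6
}
```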
+ template + typename std::enable_if::value, MatrixRef>::type & + operator=(std::initializer_list) = delete; + + }; + +} // end namespace at diff --git a/aten/src/ATen/OptionsGuard.cpp b/aten/src/ATen/OptionsGuard.cpp new file mode 100644 index 0000000..d36911b --- /dev/null +++ b/aten/src/ATen/OptionsGuard.cpp @@ -0,0 +1,16 @@ +#include +#include + +namespace at { + +thread_local at::optional DefaultTensorOptions::options_; + +TensorOptions& DefaultTensorOptions::get() { + if (!options_) { + options_.emplace( + /*use_thread_local_default_options=*/false); + } + return *options_; +} + +} // namespace at diff --git a/aten/src/ATen/OptionsGuard.h b/aten/src/ATen/OptionsGuard.h new file mode 100644 index 0000000..1aa39ac --- /dev/null +++ b/aten/src/ATen/OptionsGuard.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +/// A wrapper over a thread local TensorOptions instance. +struct DefaultTensorOptions { + /// Returns the current thread local default options. + /// Defined in OptionsGuard.cpp because we can't use optional in headers, due + /// to Windows and other compilers. + static TensorOptions& get(); + + private: + /// This is an optional because of compiler bugs that mis-initialize static + /// thread local variables. The workaround is lazy initialization, i.e. + /// `DefaultTensorOptions::get()` will initialize the `options_` to a proper + /// value upon first invocation. + /// https://gcc.gnu.org/ml/gcc-bugs/2013-12/msg00026.html + static thread_local at::optional options_; +}; + +/// RAII guard that stores the current default options upon construction, sets +/// the current default options to the ones given to its constructor, and +/// finally resets the options back to the original ones in the destructor. +struct OptionsGuard { + /// Stores the current default options and sets them to the given ones. + explicit OptionsGuard(const TensorOptions& options) + : original_(DefaultTensorOptions::get()) { + DefaultTensorOptions::get() = options; + } + + /// Restores the original default options. + ~OptionsGuard() { + DefaultTensorOptions::get() = original_; + } + + /// Returns the original options that were in place at the time of + /// construction of this object. + const TensorOptions& original() { + return original_; + } + + private: + /// The original options that were in place at the time of construction of + /// this object. + TensorOptions original_; +}; + +} // namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h new file mode 100644 index 0000000..358dde9 --- /dev/null +++ b/aten/src/ATen/Parallel.h @@ -0,0 +1,68 @@ +#pragma once +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace internal { +// This parameter is heuristically chosen to determine the minimum number of +// work that warrants paralellism. For example, when summing an array, it is +// deemed inefficient to parallelise over arrays shorter than 32768. Further, +// no parallel algorithm (such as parallel_reduce) should split work into +// smaller than GRAIN_SIZE chunks. 
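The GRAIN_SIZE constant and the parallel_for/parallel_reduce helpers it guards are defined next. As a hedged usage sketch (assumes an ATen build with OpenMP enabled; include path assumed), a call site passes a half-open index range, a grain size, and a lambda over a sub-range:

```cpp
// Usage sketch for at::parallel_for as declared below:
// parallel_for(begin, end, grain_size, f) calls f(sub_begin, sub_end) on each
// thread's chunk, or f(begin, end) once when OpenMP is unavailable or the range
// is smaller than grain_size.
#include <ATen/Parallel.h>   // include path assumed for this sketch
#include <cstdint>
#include <vector>

void scale_all(std::vector<double>& v, double factor) {
  at::parallel_for(0, static_cast<int64_t>(v.size()), at::internal::GRAIN_SIZE,
                   [&](int64_t begin, int64_t end) {
                     for (int64_t i = begin; i < end; ++i) {
                       v[i] *= factor;   // each chunk touches a disjoint index range
                     }
                   });
}
```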
+constexpr int64_t GRAIN_SIZE = 32768; +} // namespace internal + +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +template +inline void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const F f) { +#ifdef _OPENMP +#pragma omp parallel if ((end - begin) >= grain_size) + { + int64_t num_threads = omp_get_num_threads(); + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = divup((end - begin), num_threads); + int64_t begin_tid = begin + tid * chunk_size; + if (begin_tid < end) + f(begin_tid, std::min(end, chunk_size + begin_tid)); + } +#else + f(begin, end); +#endif +} + +template +inline scalar_t parallel_reduce( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const scalar_t ident, + const F f, + const SF sf) { + if (get_num_threads() == 1) { + return f(begin, end, ident); + } else { + const int64_t num_results = divup((end - begin), grain_size); + std::vector results(num_results); + scalar_t* results_data = results.data(); +#pragma omp parallel for if ((end - begin) >= grain_size) + for (int64_t id = 0; id < num_results; id++) { + int64_t i = begin + id * grain_size; + results_data[id] = f(i, i + std::min(end - i, grain_size), ident); + } + return std::accumulate( + results_data, results_data + results.size(), ident, sf); + } +} + +} // namespace at diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h new file mode 100644 index 0000000..8fe9c02 --- /dev/null +++ b/aten/src/ATen/Registry.h @@ -0,0 +1,216 @@ +#pragma once + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything that + * can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class AT_API Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. 
+ std::lock_guard lock(register_mutex_); + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register(const SrcType& key, Creator creator, const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. + */ + std::vector Keys() { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + Registry(const Registry&) = delete; + Registry& operator=(const Registry&) = delete; +}; + +template +class AT_API Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + // TODO(jiayq): old versions of NVCC does not handle make_unique well + // so we are forced to use a unique_ptr constructor here. Check if it is + // fine to use make_unique in the future. + // return make_unique(args...); + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + * Pretty much a copy from 'folly/Preprocessor.h' + */ +#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 +#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) +#else +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) +#endif + +/** + * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +#define AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + AT_API Registry, __VA_ARGS__>* RegistryName(); \ + typedef Registerer, __VA_ARGS__> \ + Registerer##RegistryName; \ + extern template class Registerer, __VA_ARGS__>; + +#define AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + Registry, __VA_ARGS__>* RegistryName() { \ + static Registry, __VA_ARGS__>* registry = \ + new Registry, __VA_ARGS__>(); \ + return registry; \ + } \ + template class Registerer, __VA_ARGS__>; + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), __VA_ARGS__); \ + } + +#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::at::demangle_type<__VA_ARGS__>()); \ + } + +// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ + AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define AT_REGISTER_CLASS(RegistryName, key, ...) \ + AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace at diff --git a/aten/src/ATen/Retainable.h b/aten/src/ATen/Retainable.h new file mode 100644 index 0000000..792a220 --- /dev/null +++ b/aten/src/ATen/Retainable.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at { + +// base class for refcounted things, allows for collects of generic +// refcounted objects that include tensors +struct Retainable { + Retainable(): refcount(1), weak_refcount(1) {} + void retain() { + ++refcount; + } + void release() { + if(--refcount == 0) { + // If we know that this is the last reference then we can skip + // all the decrements and release_resources(). + if (weak_refcount == 1) { + delete this; + } else { + release_resources(); + weak_release(); + } + } + } + void weak_retain() { + ++weak_refcount; + } + void weak_release() { + if (--weak_refcount == 0) { + delete this; + } + } + bool weak_lock() { + for (;;) { + auto current_refcount = refcount.load(); + if (current_refcount == 0) return false; + if (refcount.compare_exchange_strong(current_refcount, current_refcount + 1)) break; + } + return true; + } + uint32_t use_count() const { + return refcount.load(); + } + uint32_t weak_use_count() const { + return weak_refcount.load(); + } + + virtual void release_resources() {}; + virtual ~Retainable() {} +private: + // INVARIANT: once refcount reaches 0 it can never go up + // INVARIANT: weak_refcount = number of weak references + (refcount > 0 ? 
1 : 0) + std::atomic refcount; + std::atomic weak_refcount; +}; + +} diff --git a/aten/src/ATen/Scalar.cpp b/aten/src/ATen/Scalar.cpp new file mode 100644 index 0000000..94925db --- /dev/null +++ b/aten/src/ATen/Scalar.cpp @@ -0,0 +1,21 @@ +#include "ATen/Config.h" + +#include "ATen/Scalar.h" + +#include + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +namespace at { +Tensor Scalar::toTensor() const { + if (Tag::HAS_t == tag) { + return Tensor(t); + } else if (Tag::HAS_d == tag) { + return CPU(kDouble).scalarTensor(*this); + } else { + assert(Tag::HAS_i == tag); + return CPU(kLong).scalarTensor(*this); + } +} +} diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h new file mode 100644 index 0000000..806b05b --- /dev/null +++ b/aten/src/ATen/Scalar.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "ATen/ATenGeneral.h" +#include "ATen/Half.h" +#include "ATen/ScalarType.h" +#include "ATen/TensorBase.h" +#include "ATen/Utils.h" + + +namespace at { + +struct Tensor; + +class AT_API Scalar { +public: + Scalar() : Scalar(int64_t(0)) {} + + explicit Scalar(const detail::TensorBase & t) + : tag(Tag::HAS_t), t(t) { + AT_CHECK(t.defined(), "Attempting to create a Scalar from an undefined tensor"); + AT_CHECK(t.dim() == 0, "Attempting to create a Scalar from a ", t.dim(), " dim tensor"); + } + +#define DEFINE_IMPLICIT_CTOR(type,name,member) \ + Scalar(type vv) \ + : tag(Tag::HAS_##member) { \ + v . member = convert(vv); \ + } + + AT_FORALL_SCALAR_TYPES(DEFINE_IMPLICIT_CTOR) + +#undef DEFINE_IMPLICIT_CTOR + + // return a new scalar that is guarenteed to be not backed by a tensor. + Scalar local() const { + if (Tag::HAS_t != tag) { + return *this; + } + return t.pImpl->localScalar(); + } + +#define DEFINE_ACCESSOR(type,name,member) \ + type to##name () const { \ + if (Tag::HAS_t == tag) { \ + return local().to##name(); \ + } else if (Tag::HAS_d == tag) { \ + return checked_convert(v.d, #type); \ + } else { \ + return checked_convert(v.i, #type); \ + } \ + } + + Tensor toTensor() const; + + AT_FORALL_SCALAR_TYPES(DEFINE_ACCESSOR) + + //also support scalar.to(); + template + T to(); + +#undef DEFINE_ACCESSOR + bool isFloatingPoint() const { + return Tag::HAS_d == tag; + } + bool isIntegral() const { + return Tag::HAS_i == tag; + } + bool isBackedByTensor() const { + return Tag::HAS_t == tag; + } + +private: + enum class Tag { HAS_d, HAS_i, HAS_t }; + Tag tag; + union { + double d; + int64_t i; + } v; + detail::TensorBase t; + friend struct Type; +}; + +// define the scalar.to() specializations +template +inline T Scalar::to() { + throw std::runtime_error("to() cast to unexpected type."); +} + +#define DEFINE_TO(T,name,_) \ +template<> \ +inline T Scalar::to() { \ + return to##name(); \ +} +AT_FORALL_SCALAR_TYPES(DEFINE_TO) +#undef DEFINE_TO + +} diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h new file mode 100644 index 0000000..4cb68a6 --- /dev/null +++ b/aten/src/ATen/ScalarType.h @@ -0,0 +1,170 @@ +#pragma once + +#include + +#include "ATen/ArrayRef.h" +#include "ATen/ATenGeneral.h" +#include "ATen/Half.h" + +namespace at { + +// NB: Order matters for this macro; it is relied upon in +// _promoteTypesLookup and probably other places. 
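Before the scalar-type macros that follow, a hedged sketch of how the Scalar wrapper above behaves: the payload is tagged as a double, an int64_t, or a 0-dim tensor, and the to*() accessors route through checked_convert, so narrowing conversions that overflow throw std::domain_error. Assumes an ATen build; include path assumed.

```cpp
// Hedged sketch of at::Scalar usage (assumes an ATen build).
#include <ATen/Scalar.h>
#include <iostream>

int main() {
  at::Scalar a = 3;        // tagged HAS_i, stored as int64_t
  at::Scalar b = 2.5;      // tagged HAS_d, stored as double

  std::cout << a.isIntegral() << " " << b.isFloatingPoint() << "\n";  // 1 1
  std::cout << a.toDouble() + b.toDouble() << "\n";                   // 5.5

  at::Scalar big = 1e12;
  // big.toInt();  // would throw std::domain_error: 1e12 overflows int
}
```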
+#define AT_FORALL_SCALAR_TYPES(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(at::Half,Half,d) \ +_(float,Float,d) \ +_(double,Double,d) + +#define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(float,Float,d) \ +_(double,Double,d) + +enum class ScalarType { +#define DEFINE_ENUM(_1,n,_2) \ + n, + AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) +#undef DEFINE_ENUM + Undefined, + NumOptions +}; + +enum class Backend { + CPU, + CUDA, + SparseCPU, + SparseCUDA, + Undefined, + NumOptions +}; + +constexpr Backend kCPU = Backend::CPU; +constexpr Backend kCUDA = Backend::CUDA; +constexpr Backend kSparseCPU = Backend::SparseCPU; +constexpr Backend kSparseCUDA = Backend::SparseCUDA; + +static inline Backend toSparse(Backend b) { + switch (b) { + case Backend::CPU: return Backend::SparseCPU; + case Backend::CUDA: return Backend::SparseCUDA; + case Backend::SparseCPU: return Backend::SparseCPU; + case Backend::SparseCUDA: return Backend::SparseCUDA; + default: throw std::runtime_error("Unknown backend"); + } +} + +static inline Backend toDense(Backend b) { + switch (b) { + case Backend::CPU: return Backend::CPU; + case Backend::CUDA: return Backend::CUDA; + case Backend::SparseCPU: return Backend::CPU; + case Backend::SparseCUDA: return Backend::CUDA; + default: throw std::runtime_error("Unknown backend"); + } +} + +static inline const char * toString(Backend b) { + switch(b) { + case Backend::CPU: return "CPU"; + case Backend::CUDA: return "CUDA"; + case Backend::SparseCPU: return "SparseCPU"; + case Backend::SparseCUDA: return "SparseCUDA"; + default: return "UNKNOWN_BACKEND"; + } +} + +#define DEFINE_CONSTANT(_,name,_2) \ +constexpr ScalarType k##name = ScalarType::name; + +AT_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) +#undef DEFINE_CONSTANT + +static inline const char * toString(ScalarType t) { +#define DEFINE_CASE(_,name,_2) \ + case ScalarType:: name : return #name; + + switch(t) { + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + default: + return "UNKNOWN_SCALAR"; + } +#undef DEFINE_CASE +} + +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ + case ScalarType:: name : return sizeof(ctype); + + switch(t) { + AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) + default: + AT_ERROR("Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + +static inline bool isIntegralType(ScalarType t) { + return (t == ScalarType::Byte || + t == ScalarType::Char || + t == ScalarType::Int || + t == ScalarType::Long || + t == ScalarType::Short); +} + +static inline bool isFloatingType(ScalarType t) { + return (t == ScalarType::Double || + t == ScalarType::Float || + t == ScalarType::Half); +} + +static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { + // This is generated according to NumPy's promote_types +#define u1 ScalarType::Byte +#define i1 ScalarType::Char +#define i2 ScalarType::Short +#define i4 ScalarType::Int +#define i8 ScalarType::Long +#define f2 ScalarType::Half +#define f4 ScalarType::Float +#define f8 ScalarType::Double +#define ud ScalarType::Undefined + static constexpr ScalarType _promoteTypesLookup + [static_cast(ScalarType::NumOptions)] + [static_cast(ScalarType::NumOptions)] = { + /* u1 i1 i2 i4 i8 f2 f4 f8, ud */ + /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8, ud }, + /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8, ud }, + /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8, ud }, + /* i4 */ { i4, i4, i4, i4, i8, f8, f4, 
f8, ud }, + /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8, ud }, + /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8, ud }, + /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8, ud }, + /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8, ud }, + /* ud */ { ud, ud, ud, ud, ud, ud, ud, ud, ud }, + }; +#undef u1 +#undef i1 +#undef i2 +#undef i4 +#undef i8 +#undef f2 +#undef f4 +#undef f8 +#undef ud + return _promoteTypesLookup[static_cast(a)][static_cast(b)]; +} + +struct Tensor; +typedef ArrayRef IntList; +typedef ArrayRef TensorList; + +} // namespace at diff --git a/aten/src/ATen/ScalarTypeUtils.h b/aten/src/ATen/ScalarTypeUtils.h new file mode 100644 index 0000000..ff96bbe --- /dev/null +++ b/aten/src/ATen/ScalarTypeUtils.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ATen/ScalarType.h" + +namespace at { + +template +struct CTypeToScalarType { +}; + +#define DEFINE_TO_SCALAR_TYPE(ct, st, _2) \ +template <> \ +struct CTypeToScalarType { \ + static inline at::ScalarType to() { return at::ScalarType::st; } \ +}; +AT_FORALL_SCALAR_TYPES(DEFINE_TO_SCALAR_TYPE) +#undef DEFINE_TO_SCALAR_TYPE + +} // namespace at diff --git a/aten/src/ATen/SmallVector.cpp b/aten/src/ATen/SmallVector.cpp new file mode 100644 index 0000000..59095a2 --- /dev/null +++ b/aten/src/ATen/SmallVector.cpp @@ -0,0 +1,50 @@ +//===- llvm/ADT/SmallVector.cpp - 'Normally small' vectors ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc + +#include "SmallVector.h" + +namespace at { + +/// grow_pod - This is an implementation of the grow() method which only works +/// on POD-like datatypes and is out of line to reduce code duplication. +void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, + size_t TSize) { + size_t CurSizeBytes = size_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. + if (NewCapacityInBytes < MinSizeInBytes) + NewCapacityInBytes = MinSizeInBytes; + + void *NewElts; + if (BeginX == FirstEl) { + NewElts = malloc(NewCapacityInBytes); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + if (NewElts == nullptr) + throw std::bad_alloc(); + } + + this->EndX = (char*)NewElts+CurSizeBytes; + this->BeginX = NewElts; + this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; +} + +} diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h new file mode 100644 index 0000000..238a181 --- /dev/null +++ b/aten/src/ATen/SmallVector.h @@ -0,0 +1,976 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. 
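Before continuing with SmallVector, one note on the promotion table above: the result is read as table[a][b], the table as written is symmetric, and the comment says it was generated from NumPy's promote_types. A hedged sanity check one could compile against this table (ATen build and include path assumed):

```cpp
// Hedged sanity check reading entries straight off the promotion table above.
#include <ATen/ScalarType.h>
#include <cassert>

int main() {
  assert(at::promoteTypes(at::kByte, at::kChar)   == at::kShort);   // u1 x i1 -> i2
  assert(at::promoteTypes(at::kInt,  at::kDouble) == at::kDouble);  // i4 x f8 -> f8
  assert(at::promoteTypes(at::kLong, at::kFloat)  == at::kFloat);   // i8 x f4 -> f4, per this table
  return 0;
}
```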
+// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + +#pragma once + +#include "AlignOf.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_API SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); + +public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { +private: + template friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + +protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + +public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T *; + using const_iterator = const T *; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T &; + using const_reference = const T &; + using pointer = T *; + using const_pointer = const T *; + + // forward iterator creation methods. 
+ iterator begin() { return (iterator)this->BeginX; } + const_iterator begin() const { return (const_iterator)this->BeginX; } + iterator end() { return (iterator)this->EndX; } + const_iterator end() const { return (const_iterator)this->EndX; } + +protected: + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + +public: + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + size_type size() const { return end()-begin(); } + size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { return const_pointer(begin()); } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. 
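The out-of-line grow() definition follows; for non-POD elements it picks the next power of two past the current capacity, while the POD path in grow_pod() (SmallVector.cpp above) roughly doubles the byte capacity. Either way, elements stay in the inline buffer until capacity is first exceeded. A hedged usage sketch of that small-buffer behaviour (ATen build and include path assumed):

```cpp
// Sketch of the small-buffer behaviour implemented by grow()/grow_pod():
// elements live inline until capacity is exceeded, then move to the heap.
#include <ATen/SmallVector.h>   // include path assumed
#include <iostream>

int main() {
  at::SmallVector<int, 4> v;          // inline space for 4 ints
  for (int i = 0; i < 4; ++i) v.push_back(i);
  const int* inline_buf = v.data();   // still points into the object itself

  v.push_back(4);                     // exceeds the inline capacity, triggers growth
  std::cout << (v.data() == inline_buf ? "still inline" : "spilled to heap") << "\n";
  std::cout << v.size() << " " << v.capacity() << "\n";  // 5 and >= 5
}
```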
+template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + +public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N*sizeof(T)) { + } + +public: + SmallVectorImpl(const SmallVectorImpl &) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. 
+ this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. 
+ iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. 
+ T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. 
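The set_size() member defined next enables exactly the pattern the comment above describes: reserve capacity, let a producer write into the raw buffer past end(), then publish the new length without value-initializing elements first. A hedged sketch, only safe for trivially constructible element types since set_size() runs no constructors or destructors; produce_ints is a hypothetical producer:

```cpp
// Sketch of the reserve() + data() + set_size() pattern described above.
#include <ATen/SmallVector.h>   // include path assumed
#include <cstddef>

// Hypothetical external producer that fills a raw buffer and reports how much it wrote.
static size_t produce_ints(int* dst, size_t max_n) {
  size_t n = max_n < 3 ? max_n : 3;
  for (size_t i = 0; i < n; ++i) dst[i] = static_cast<int>(i * 10);
  return n;
}

at::SmallVector<int, 8> read_values() {
  at::SmallVector<int, 8> out;
  out.reserve(8);                                  // guarantee capacity up front
  size_t written = produce_ints(out.data(), 8);    // write past end(), within capacity
  out.set_size(written);                           // publish the size afterwards
  return out;
}
```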
+ void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. 
+ iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template struct SmallVectorStorage {}; +template struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + +public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container &&c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVector &LHS, at::SmallVector &RHS) { + LHS.swap(RHS); + } + +} // end namespace std diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp new file mode 100644 index 0000000..62c8356 --- /dev/null +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -0,0 +1,87 @@ +#include +#include + +namespace at { + + +// An empty dense tensor defaults to a 1-dimensional tensor of size [0] +// (recall, it is not a 0-dimensional tensor, because such a tensor would +// a scalar and have one element) +// +// Thus, an empty sparse tensor should be a 1-dimensional tensor of size [0]. +// Furthermore, we have dim == sparseDims + denseDims; since this is a sparse +// tensor, let us say that an empty sparse tensor has sparseDims == 1 and +// denseDims == 0. (There is a degree of freedom here, but given that this +// is a sparse dimension, it seems reasonable to demand that sparseDims > 0). +// +// In an ideal world, this would then mean we allocate a [1,0] size indices +// tensor and a [0] size values tensor for such an empty tensor. However, +// we don't currently support zero-size dimensions, so we can't actually +// do this; so we just allocate zero-size tensors for everything. 
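Concretely, the constructor that follows leaves a fresh sparse tensor with size [0], sparseDims == 1, denseDims == 0, nnz == 0, and zero-element indices_ and values_ tensors, rather than the ideal (1, 0) indices / (0,) values shapes the comment describes. A small bookkeeping sketch of that convention (plain C++, no ATen calls; values chosen to match the comment above):

```cpp
// Shape bookkeeping for the empty-sparse-tensor convention described above.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> size = {0};     // 1-dimensional tensor of size [0]
  int64_t sparseDims = 1;
  int64_t denseDims  = 0;
  int64_t nnz        = 0;

  // dim() == sparseDims + denseDims must match the length of the size vector.
  assert(sparseDims + denseDims == static_cast<int64_t>(size.size()));

  // Ideal shapes (if zero-size dimensions were supported):
  //   indices: (sparseDims, nnz) = (1, 0)
  //   values : (nnz, ...dense)   = (0,)
  // Actual allocation here: both indices_ and values_ are zero-element tensors.
  assert(nnz == 0);
  return 0;
}
```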
+SparseTensorImpl::SparseTensorImpl(Type * type) + : TensorImpl(type) + , size_{0} + , sparseDims_(1) + , denseDims_(0) + , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) + , values_(type->toDense().tensor()) { + AT_ASSERT(type->is_sparse()); + } + +const char * SparseTensorImpl::toString() const { + // TODO: also give back type information + return "SparseTensor"; +} +IntList SparseTensorImpl::sizes() const { + return size_; +} +IntList SparseTensorImpl::strides() const { + AT_ERROR("sparse tensors do not have strides"); +} +int64_t SparseTensorImpl::dim() const { + return sparseDims_ + denseDims_; +} +Scalar SparseTensorImpl::localScalar() { + int64_t n = numel(); + AT_CHECK(n == 1, "a Tensor with ", n, " elements cannot be converted to Scalar"); + if (nnz_ == 0) return Scalar(0); + if (coalesced_) return values_.pImpl->localScalar(); + // You have a non-coalesced scalar sparse tensor?! Wow! Have + // a cookie. + return values_.sum().pImpl->localScalar(); +} +void * SparseTensorImpl::unsafeGetTH(bool retain) { + AT_ERROR("unsafeGetTH not supported for new style TensorImpl"); +} +std::unique_ptr SparseTensorImpl::storage() { + AT_ERROR("sparse tensors do not have storage"); +} + +void SparseTensorImpl::set_indices_and_values(const Tensor& indices, const Tensor& values) { + // TODO: Explicit empty test is needed because we don't handle size zero + // dimensions at the moment + bool empty = values.numel() == 0; + AT_CHECK(values.type().toSparse() == type(), "values type must match sparse tensor type"); + AT_CHECK(indices.type().scalarType() == kLong, "indices must be an int64 tensor"); + AT_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); + AT_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); + if (!empty) { + AT_CHECK(indices.dim() == 2, "indices must be nDim x nnz"); + AT_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz"); + AT_CHECK(indices.size(0) == sparseDims_, "indices has incorrect first dimension, expected ", sparseDims_, ", got ", indices.size(0)); + AT_CHECK(values.dim() == denseDims_ + 1, "values has incorrect number of dimensions, expected ", denseDims_ + 1, ", got ", values.dim()); + } else { + AT_CHECK(indices.numel() == 0, "if values is empty, indices must be empty too"); + } + indices_ = indices; + values_ = values; + // TODO: Eliminate this ternary when we handle size zero dimensions. + // (Actually, this will "accidentally" work today because all zero-size + // tensors have size [0], and so you'll get 0 when empty is zero; but it's + // more explicit this way.) + nnz_ = empty ? 0 : values.size(0); + coalesced_ = false; +} + + +} // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h new file mode 100644 index 0000000..2093b45 --- /dev/null +++ b/aten/src/ATen/SparseTensorImpl.h @@ -0,0 +1,105 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/TensorImpl.h" +#include "ATen/Error.h" + +namespace at { +struct AT_API SparseTensorImpl : public TensorImpl { + // Stored in COO format, indices + values. 
+ + // Ideal INVARIANTS: + // _sparseDims: range [0, len(shape)]; _sparseDims + _denseDims = len(shape) + // _denseDims : range [0, len(shape)]; _sparseDims + _denseDims = len(shape) + // _indices.shape: dimensionality: 2, shape: (_sparseDims, nnz) + // _values.shape: dimensionality: 1 + _denseDims. shape: (nnz, shape[_sparseDims:]) + + // Actual INVARIANT differences: + // 1) _sparseDims: range [1, len(shape)] (i.e. we don't allow 0 sparse dimensions) + // 2) when nnz = 0, there is strange behavior because we lack 0-dimensional sparse tensors. Namely: + // dimensionality == 0, _sparseDims == 0, _denseDims == 0, _indices.shape == {0}, _values.shape == {0} + // 3) For both _indices.shape and _values.shape, the nnz dimension may be larger than nnz + // 4) For _values.shape, the non-nnz dimensions may be smaller than the corresponding dimension size, e.g. + // a shape (2,3) sparse tensor with _sparseDims == 1, may have _values.shape: (nnz, <=2, <=3). + + + // The true size of the sparse tensor (e.g., if you called to_dense() + // on it). When THTensor merges into TensorImpl, this field + // should move to the parent class. + std::vector size_; + + // The number of non-zero elements. + int64_t nnz_ = 0; + + int64_t sparseDims_ = 0; // number of sparse dimensions + int64_t denseDims_ = 0; // number of dense dimensions + + Tensor indices_; // always a LongTensor + Tensor values_; + + // A sparse tensor is 'coalesced' if every index occurs at most once in + // the indices tensor, and the indices are in sorted order. (This means + // that it is very easy to convert a coalesced tensor to CSR format: you + // need only compute CSR format indices.) + // + // Most math operations can only be performed on coalesced sparse tensors, + // because many algorithms proceed by merging two sorted lists (of indices). + bool coalesced_ = false; + +public: + // Public for now... + explicit SparseTensorImpl(Type * type); + + int64_t nnz() const { return nnz_; } + int64_t sparseDims() const { return sparseDims_; } + int64_t denseDims() const { return denseDims_; } + bool coalesced() const { return coalesced_; } + Tensor indices() const { return indices_; } + Tensor values() const { return values_; } + + const char * toString() const override; + IntList sizes() const override; + IntList strides() const override; + int64_t dim() const override; + Scalar localScalar() override; + void * unsafeGetTH(bool retain) override; + std::unique_ptr storage() override; + + // Some ops do some manual size fiddling. + // TODO: Figure out a more safe way to provide this functionality + std::vector& _sizes_mut() { return size_; } + + // WARNING: This function does NOT preserve invariants of sparseDims/denseDims with + // respect to indices and values + void raw_resize_(int64_t sparseDims, int64_t denseDims, ArrayRef size) { + // UGHHHHH. Legacy special case + if (size.size() == 0) { + size_ = {0}; + } else { + size_ = size; + } + sparseDims_ = sparseDims; + denseDims_ = denseDims; + } + + // TODO: I hate these two setters, please get rid of them!!! 
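The "coalesced" invariant described above is easiest to see with a single sparse dimension: sort the entries by index and merge duplicates by summing their values. The sketch below shows that operation with plain containers; it illustrates the invariant only and is not the ATen coalescing kernel.

```
// Coalescing for the 1-sparse-dim case: sort by index, merge duplicates.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

void coalesce(std::vector<std::int64_t>& idx, std::vector<double>& val) {
  std::vector<std::pair<std::int64_t, double>> entries;
  for (std::size_t i = 0; i < idx.size(); ++i) {
    entries.emplace_back(idx[i], val[i]);
  }
  std::sort(entries.begin(), entries.end());
  idx.clear();
  val.clear();
  for (const auto& e : entries) {
    if (!idx.empty() && idx.back() == e.first) {
      val.back() += e.second;  // duplicate index: accumulate
    } else {
      idx.push_back(e.first);
      val.push_back(e.second);
    }
  }
}

int main() {
  std::vector<std::int64_t> idx = {3, 1, 3};
  std::vector<double> val = {1.0, 2.0, 4.0};
  coalesce(idx, val);  // idx = {1, 3}, val = {2.0, 5.0}
  for (std::size_t i = 0; i < idx.size(); ++i) {
    std::cout << idx[i] << " -> " << val[i] << "\n";
  }
  return 0;
}
```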
+ void set_indices(const Tensor& indices) { + AT_ASSERT(indices.type().backend() == at::toDense(type().backend())); + AT_ASSERT(indices.type().scalarType() == kLong); + indices_ = indices; + } + void set_values(const Tensor& values) { + AT_ASSERT(values.type().toSparse() == type()); + values_ = values; + } + + void set_coalesced(bool coalesced) { coalesced_ = coalesced; } + void set_nnz(int64_t nnz) { nnz_ = nnz; } + + // This used to be called THSTensor_(_move) + // NB: This used to be able to avoid a refcount bump, but I was too lazy to + // make it happen + void set_indices_and_values(const Tensor& indices, const Tensor& values); +}; + +} // namespace at diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/SparseTensorRef.h new file mode 100644 index 0000000..9c9fada --- /dev/null +++ b/aten/src/ATen/SparseTensorRef.h @@ -0,0 +1,11 @@ +#pragma once + +namespace at { + +struct Tensor; +struct SparseTensorRef { + explicit SparseTensorRef(const Tensor& t): tref(t) {} + const Tensor& tref; +}; + +} diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h new file mode 100644 index 0000000..582a82a --- /dev/null +++ b/aten/src/ATen/Storage.h @@ -0,0 +1,41 @@ +#pragma once + +#include "ATen/Scalar.h" + +namespace at { + +struct Type; + +struct Storage { + static const char RESIZABLE = 2; + + Storage() {} + Storage(const Storage& other) = delete; + void operator=(const Storage&) = delete; + + virtual ~Storage() {}; + virtual size_t elementSize() const = 0; + virtual size_t size() const = 0; + virtual void* data() = 0; + virtual const void* data() const = 0; + virtual Storage& retain() = 0; + virtual Storage& free() = 0; + virtual void * unsafeGetTH(bool retain) const = 0; + + virtual Storage& resize(int64_t new_size) = 0; + + virtual Type & type() const = 0; + virtual int getDevice() const = 0; + virtual const char * toString() const = 0; + + virtual Storage& fill(Scalar value) = 0; + virtual Storage& set(size_t ind, Scalar value) = 0; + virtual Storage& fast_set(size_t ind, Scalar value) = 0; + virtual Scalar get(size_t ind) = 0; + virtual Scalar fast_get(size_t ind) = 0; + + virtual void set_flag(char flag) = 0; + virtual void clear_flag(char flag) = 0; +}; + +} // namespace at diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h new file mode 100644 index 0000000..11c6ca8 --- /dev/null +++ b/aten/src/ATen/THLongStorageView.h @@ -0,0 +1,77 @@ +#pragma once + +#include "TH/TH.h" +#include "TH/THStorage.hpp" +#include "TH/THTypeConversion.hpp" + +namespace at { + +enum class THLongStorageViewKind { + SIZE, + STRIDE, + LENGTH, +}; + +// make a fake storage out of a size, pointer pair... +// used as an argument where THSize and THStride are passed into TH +class THLongStorageView { +public: + operator THLongStorage*() { + if (storage.size == 0 && zero_dim_to_null) { + return nullptr; + } + return &storage; + } + + /* + // This is done as an enum, and not as static constructors, as there + // is no move/copy constructor for THLongStorageView + + static THLongStorageView makeFromSize(ArrayRef ref) { + ... + } + + static THLongStorageView makeFromLength(ArrayRef ref) { + ... 
+ } + */ + + THLongStorageView(ArrayRef ref, THLongStorageViewKind kind) + : zero_dim_to_null(false) + { + // zero_dim_to_one converts an empty ArrayRef into [1] + // zero_dim_to_null converts an empty ArrayRef into a null THLongStorage + bool zero_dim_to_one = false; + bool noelem_to_empty = false; + switch (kind) { + case THLongStorageViewKind::SIZE: + zero_dim_to_one = true; + break; + case THLongStorageViewKind::STRIDE: + zero_dim_to_null = true; + break; + case THLongStorageViewKind::LENGTH: + break; + } + + if(zero_dim_to_one && ref.size() == 0) { + // make storage of size 0 actually a 1-length storage with 1 element + // so that our 0-dim tensors get allocated as 1-dim inside TH + one = 1; + storage.data_ptr = {&one, kCPU}; // non-owning + storage.size = 1; + } else { + storage.data_ptr = {const_cast(static_cast(ref.data())), kCPU}; // non-owning + storage.size = ref.size(); + } + storage.scalar_type = at::CTypeToScalarType>::to(); + storage.refcount = 0; + storage.flag = 0; + } +private: + int64_t one; + THLongStorage storage; + bool zero_dim_to_null; +}; + +} diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/Tensor.cpp new file mode 100644 index 0000000..88ecdab --- /dev/null +++ b/aten/src/ATen/Tensor.cpp @@ -0,0 +1,14 @@ +#include + +#include + +namespace at { + +void Tensor::print() const { + if (defined()) { + std::cerr << "[" << type().toString() << " " << sizes() << "]" << std::endl; + } else { + std::cerr << "[UndefinedTensor]" << std::endl; + } +} +} // namespace at diff --git a/aten/src/ATen/TensorAccessor.h b/aten/src/ATen/TensorAccessor.h new file mode 100644 index 0000000..e51af27 --- /dev/null +++ b/aten/src/ATen/TensorAccessor.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +#include "ATen/ScalarType.h" + +namespace at { + + +template +class TensorAccessorBase { +public: + TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_), sizes_(sizes_), strides_(strides_) {} + IntList sizes() { + return IntList(sizes_,N); + } + IntList strides() { + return IntList(strides_,N); + } + int64_t stride(int64_t i) { return strides()[i]; } + int64_t size(int64_t i) { return sizes()[i]; } +protected: + T * data_; + const int64_t* sizes_; + const int64_t* strides_; +}; + +template +class TensorAccessor : public TensorAccessorBase { +public: + TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + + TensorAccessor operator[](int64_t i) { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } +}; + +template +class TensorAccessor : public TensorAccessorBase { +public: + TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + +} diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h new file mode 100644 index 0000000..3aea68f --- /dev/null +++ b/aten/src/ATen/TensorBase.h @@ -0,0 +1,108 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include "ATen/UndefinedTensor.h" + +namespace at { namespace detail { + +// TensorBaseImpl is the base class for Tensor which handles the reference counting +template +struct TensorBaseImpl { + TensorBaseImpl(): TensorBaseImpl(UndefinedTensor::singleton(), false) {} + TensorBaseImpl(TensorImpl * self, bool should_retain) + : pImpl(self) { + if (pImpl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not 
supported"); + } + if(should_retain && pImpl != UndefinedTensor::singleton()) { + retain(); + } + } + TensorBaseImpl(const TensorBaseImpl & rhs) + : pImpl(rhs.pImpl) { + if (pImpl != UndefinedTensor::singleton()) { + retain(); + } + } + TensorBaseImpl(TensorBaseImpl && rhs) noexcept + : pImpl(rhs.pImpl) { + rhs.pImpl = UndefinedTensor::singleton(); + } + ~TensorBaseImpl() { + if (pImpl != UndefinedTensor::singleton()) { + release(); + } + } + TensorBaseImpl & operator=(TensorBaseImpl && rhs) & { + rhs.swap(*this); + return *this; + } + TensorBaseImpl & operator=(TensorBaseImpl const & rhs) & { + //TensorBaseImpl ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally TensorBaseImpl dtor releases rhs.pImpl, which was originally this->pImpl + TensorBaseImpl(rhs).swap(*this); + return *this; + } + int64_t dim() const { + if (is_strong) { + return pImpl->dim(); + } else { + AT_ERROR("Can't call dim() on a WeakTensor"); + } + } + void reset() { + TensorBaseImpl().swap(*this); + } + void reset(TensorImpl * rhs) { + TensorBaseImpl(rhs, true).swap(*this); + } + void reset(TensorImpl * rhs, bool should_retain) { + TensorBaseImpl(rhs, should_retain).swap(*this ); + } + void swap(TensorBaseImpl & rhs) { + TensorImpl * tmp = pImpl; + pImpl = rhs.pImpl; + rhs.pImpl = tmp; + } + TensorImpl * get() const { + return pImpl; + } + TensorImpl * detach() { + TensorImpl * ret = pImpl; + pImpl = UndefinedTensor::singleton(); + return ret; + } + + bool defined() const { + return pImpl != UndefinedTensor::singleton(); + } + + friend struct Type; + + //TODO(zach): sort out friend structes +public: + TensorImpl * pImpl; + +private: + void retain() { + if (is_strong) { + pImpl->retain(); + } else { + pImpl->weak_retain(); + } + } + + void release() { + if (is_strong) { + pImpl->release(); + } else { + pImpl->weak_release(); + } + } +}; + +using TensorBase = TensorBaseImpl; +using WeakTensorBase = TensorBaseImpl; + +}} // namespace at::detail diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp new file mode 100644 index 0000000..98d47ec --- /dev/null +++ b/aten/src/ATen/TensorGeometry.cpp @@ -0,0 +1,23 @@ +#include + +#include + +namespace at { + +bool TensorGeometry::is_contiguous() const { + int64_t dim = sizes_.size(); + int64_t expected_stride = 1; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes_[i] != 1 && strides_[i] != expected_stride) { + return false; + } + expected_stride *= sizes_[i]; + } + return true; +} + +Tensor TensorGeometry::zeros_with_stride(const Type& type) const { + return type.tensor(sizes_, strides_).zero_(); +} + +} // namespace at diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h new file mode 100644 index 0000000..60f6098 --- /dev/null +++ b/aten/src/ATen/TensorGeometry.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include + +namespace at { + +struct AT_API TensorGeometry { + TensorGeometry() : storage_offset_(0) {} + + explicit TensorGeometry(IntList sizes) + : sizes_(sizes) + , strides_(sizes.size()) + , storage_offset_(0) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + for (int64_t i = dim - 1; i >= 0; i--) { + strides_[i] = expected_stride; + expected_stride *= sizes_[i]; + } + } + + explicit TensorGeometry(const Tensor& t) + : sizes_(t.sizes()) + , strides_(t.strides()) + , storage_offset_(t.storage_offset()) {} + + // true if the tensor is contiguous + bool is_contiguous() const; + + // creates a new tensor with the sizes and strides of the source + Tensor 
zeros_with_stride(const Type& type) const; + + int64_t dim() const { return sizes_.size(); } + int64_t size(int64_t dim) const { + dim = maybe_wrap_dim(dim, this->dim()); + return sizes_.at(static_cast(dim)); + } + IntList sizes() const { return IntList{ sizes_ }; } + int64_t stride(int64_t dim) const { + dim = maybe_wrap_dim(dim, this->dim()); + return strides_.at(static_cast(dim)); + } + IntList strides() const { return IntList{ strides_ }; } + int64_t storage_offset() const { return storage_offset_; } + int64_t numel() const { + int64_t r = 1; + for (auto s : sizes()) { + r *= s; + } + return r; + } + + TensorGeometry transpose(int64_t dim0, int64_t dim1) { + TensorGeometry r = *this; // copy + AT_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") + AT_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") + std::swap(r.sizes_[dim0], r.sizes_[dim1]); + std::swap(r.strides_[dim0], r.strides_[dim1]); + return r; + } + + std::vector sizes_; + std::vector strides_; + int64_t storage_offset_; +}; + +} // namespace at diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp new file mode 100644 index 0000000..a77664d --- /dev/null +++ b/aten/src/ATen/TensorImpl.cpp @@ -0,0 +1,36 @@ +#include + +#include +#include + +namespace at { +Tensor& TensorImpl::grad() { + AT_ERROR("grad is not implemented for Tensor"); +} + +const Tensor& TensorImpl::grad() const { + AT_ERROR("grad is not implemented for Tensor"); +} + +Tensor TensorImpl::detach() const { + AT_ERROR("detach is not implemented for Tensor"); +} + +void TensorImpl::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + AT_ERROR("backward is not implemented for Tensor"); +} + +void TensorImpl::set_data(Tensor new_data) { + AT_ERROR("set_type is not implemented for Tensor"); +} + +void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + pImpl->backward(std::move(gradient), keep_graph, create_graph); +} +} // namespace at diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h new file mode 100644 index 0000000..f5abf15 --- /dev/null +++ b/aten/src/ATen/TensorImpl.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include + +#include "ATen/Retainable.h" +#include "ATen/ScalarType.h" +#include "ATen/optional.h" + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct TensorImpl : public Retainable { + explicit TensorImpl(Type * type) + : is_scalar(false), type_(type) {} + + Type & type() const { + return *type_; + } + virtual const char * toString() const = 0; + virtual IntList sizes() const = 0; + virtual IntList strides() const = 0; + virtual int64_t dim() const = 0; + /** + * Perform a conversion of this tensor to a scalar, if numel() == 1. + * Otherwise, raise an error. + */ + virtual Scalar localScalar() = 0; + virtual void * unsafeGetTH(bool retain) = 0; + virtual std::unique_ptr storage() = 0; + friend struct Type; + + int64_t numel() { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + + // 0-dim patchup of TH requires us to have a flag marking + // if a Tensor should be treated as 0-dim. + // the generated wrapper manipulates this flag. 
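The TensorGeometry(IntList) constructor and is_contiguous() shown above share one convention: row-major (C-contiguous) strides are built by scanning the sizes from the last dimension backwards. Here is a standalone sketch of both, on plain vectors rather than ATen types.

```
// Row-major stride construction and the matching contiguity check.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::int64_t> contiguous_strides(const std::vector<std::int64_t>& sizes) {
  std::vector<std::int64_t> strides(sizes.size());
  std::int64_t expected = 1;
  for (std::int64_t i = static_cast<std::int64_t>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = expected;
    expected *= sizes[i];
  }
  return strides;
}

bool is_contiguous(const std::vector<std::int64_t>& sizes,
                   const std::vector<std::int64_t>& strides) {
  std::int64_t expected = 1;
  for (std::int64_t i = static_cast<std::int64_t>(sizes.size()) - 1; i >= 0; --i) {
    if (sizes[i] != 1 && strides[i] != expected) return false;
    expected *= sizes[i];
  }
  return true;
}

int main() {
  std::vector<std::int64_t> sizes = {2, 3, 4};
  auto strides = contiguous_strides(sizes);            // {12, 4, 1}
  std::cout << is_contiguous(sizes, strides) << "\n";  // 1
  std::swap(strides[0], strides[1]);                   // a "transposed" layout
  std::cout << is_contiguous(sizes, strides) << "\n";  // 0
  return 0;
}
```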
+ // the setter should never be exposed in Tensor's public API + // because eventually we would like isScalar() to just be dim() == 0; + bool isScalar() const { + return is_scalar; + } + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be a scalar. e.g. when all inputs + // to a function 'add' were scalars, then condition_when_scalar == true. + // we also prevent this from getting marked as a scalar if it is not + // the right shape afterall. + TensorImpl* maybeScalar(bool condition_when_scalar) { + is_scalar = false; //force dim() to tell the truth for TH + is_scalar = condition_when_scalar && dim() == 1 && sizes()[0] == 1; + return this; + } + void setScalar(bool s) { + is_scalar = s; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + AT_API virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + AT_API virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + AT_API virtual Tensor& grad(); + AT_API virtual const Tensor& grad() const; + + AT_API virtual Tensor detach() const; + AT_API virtual void detach_() { + AT_ERROR("detach_ is not implemented for Tensor"); + } + + AT_API virtual void backward( + at::optional gradient, + bool keep_graph, + bool create_graph); + + AT_API virtual void set_data(Tensor new_data); + +protected: + bool is_scalar; + Type * type_; +}; +} // namespace at diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h new file mode 100644 index 0000000..a1b191f --- /dev/null +++ b/aten/src/ATen/TensorOperators.h @@ -0,0 +1,93 @@ +#pragma once + +#include "ATen/Scalar.h" +#include "ATen/Tensor.h" +#include "ATen/Type.h" + +#include +#include + +namespace at { + + +inline Tensor & Tensor::operator=(Scalar v) && { + return fill_(v); +} +inline Tensor Tensor::operator-() const { + return neg(); +} +inline Tensor& Tensor::operator+=(const Tensor & other) { + return add_(other); +} +inline Tensor& Tensor::operator+=(Scalar other) { + return add_(other); +} +inline Tensor& Tensor::operator-=(const Tensor & other) { + return sub_(other); +} +inline Tensor& Tensor::operator-=(Scalar other) { + return sub_(other); +} +inline Tensor& Tensor::operator*=(const Tensor & other) { + return mul_(other); +} +inline Tensor& Tensor::operator*=(Scalar other) { + return mul_(other); +} +inline Tensor& Tensor::operator/=(const Tensor & other) { + return div_(other); +} +inline Tensor& Tensor::operator/=(Scalar other) { + return div_(other); +} +inline Tensor Tensor::operator[](Scalar index) const { + AT_CHECK( + index.local().isIntegral(), + "Can only index tensors with integral scalars (got ", + index.toTensor().type().toString(), ")"); + return select(0, index.toLong()); +} +inline Tensor Tensor::operator[](Tensor index) const { + // These properties are checked in the Scalar constructor, but we already + // check them here to provide more useful diagnostics for the user. + AT_CHECK(index.defined(), "Can only index with tensors that are defined"); + AT_CHECK( + index.dim() == 0, + "Can only index with tensors that are scalars (zero-dim)"); + // The Scalar(Tensor) constructor is explicit, so we need to call it. 
+ return this->operator[](Scalar(index)); +} +inline Tensor Tensor::operator[](int64_t index) const { + return select(0, index); +} + +#define AT_FORALL_BINARY_OPS(_) \ +_(+,x.add(y), y.add(x)) \ +_(*,x.mul(y), y.mul(x)) \ +_(-,x.sub(y), y.type().tensor().resize_(y.sizes()).fill_(x).sub_(y)) \ +_(/,x.div(y), y.type().tensor().resize_(y.sizes()).fill_(x).div_(y)) \ +_(%,x.remainder(y), y.type().tensor().resize_(y.sizes()).fill_(x).remainder_(y)) \ +_(<,x.lt(y), y.gt(x)) \ +_(<=,x.le(y), y.ge(x)) \ +_(>,x.gt(y),y.lt(x)) \ +_(>=,x.ge(y), y.le(x)) \ +_(==,x.eq(y), y.eq(x)) \ +_(!=,x.ne(y), y.ne(x)) + +#define DEFINE_OPERATOR(op,body,reverse_scalar_body) \ +static inline Tensor operator op(const Tensor & x, const Tensor & y) { \ + return body; \ +} \ +static inline Tensor operator op(const Tensor & x, Scalar y) { \ + return body; \ +} \ +static inline Tensor operator op(Scalar x, const Tensor & y) { \ + return reverse_scalar_body; \ +} + + +AT_FORALL_BINARY_OPS(DEFINE_OPERATOR) +#undef DEFINE_OPERATOR +#undef AT_FORALL_BINARY_OPS + +} diff --git a/aten/src/ATen/TensorOptions.cpp b/aten/src/ATen/TensorOptions.cpp new file mode 100644 index 0000000..cb8b9bf --- /dev/null +++ b/aten/src/ATen/TensorOptions.cpp @@ -0,0 +1,19 @@ +#include + +#include +#include +#include +#include +#include + +namespace at { + +TensorOptions::TensorOptions(bool use_thread_local_default_options) { + if (use_thread_local_default_options) { + this->dtype(DefaultTensorOptions::get().dtype()); + this->device(DefaultTensorOptions::get().device()); + this->layout(DefaultTensorOptions::get().layout()); + this->requires_grad(DefaultTensorOptions::get().requires_grad()); + } +} +} // namespace at diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h new file mode 100644 index 0000000..53ad9d8 --- /dev/null +++ b/aten/src/ATen/TensorOptions.h @@ -0,0 +1,279 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +/// A class to encapsulate construction axes of a `Tensor`. +/// `TensorOptions` is a virtual class to enable overriding of certain methods +/// by subclasses in other libraries, such as PyTorch. In PyTorch, there is a +/// `torch::TensorOptions` subclass of this `TensorOptions`, which changes +/// `type()` to return a variable type instead of a tensor type, such that +/// variables are created inside factory methods, instead of tensors. +struct AT_API TensorOptions { + TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} + + /// Constructs the `TensorOptions` with defaults taken from the thread local + /// `TensorOptions` object if `use_thread_local_default_options`, else + /// defaults to: + /// - dtype: kFloat, + /// - device: kCPU, + /// - layout: kStrided, + /// - requires_grad: false + explicit TensorOptions(bool use_thread_local_default_options); + + /// Constructs the `TensorOptions` from the type of the given `Tensor`. + /// If the `Tensor` has a CUDA type, the `device_index` will match that of the + /// tensor. The `requires_grad` property of the tensor is ignored and set to + /// false in the created `TensorOptions`. See the constructor from `Type` for + /// the semantics w.r.t. the `type()` method. 
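The AT_FORALL_BINARY_OPS / DEFINE_OPERATOR pair in TensorOperators.h above is an X-macro: one list of (operator, body) entries is expanded through a second macro that stamps out the actual operator functions. A stripped-down sketch of the same trick on a toy type (not ATen):

```
#include <iostream>

struct Num {
  double v;
  Num add(Num o) const { return {v + o.v}; }
  Num mul(Num o) const { return {v * o.v}; }
};

// One list of entries...
#define FORALL_BINARY_OPS(_) \
  _(+, x.add(y))             \
  _(*, x.mul(y))

// ...and one macro that turns each entry into a free operator.
#define DEFINE_OPERATOR(op, body) \
  inline Num operator op(const Num& x, const Num& y) { return body; }

FORALL_BINARY_OPS(DEFINE_OPERATOR)
#undef DEFINE_OPERATOR
#undef FORALL_BINARY_OPS

int main() {
  Num a{2.0}, b{3.0};
  std::cout << (a + b).v << " " << (a * b).v << "\n";  // 5 6
  return 0;
}
```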
+ explicit TensorOptions(Tensor tensor, bool discard_runtime_type = false) { + if (!discard_runtime_type) { + type_ = &tensor.type(); + } + this->dtype(tensor.dtype()); + this->device(tensor.device()); + this->layout(tensor.layout()); + } + + /// Constructs the `TensorOptions` from a type and a `device_index`. + /// + /// If `discard_runtime_type` is false (the default), the behavior of + /// `TensorOptions::type()` is changed in that it will always return this + /// `type`, irrespective of any `device` or `dtype` or `layout` specified at a + /// later time. This is to ensure that when a `TensorOptions` object is + /// constructed from a tensor's type, and that type has a dynamic type other + /// than `at::Type` (e.g. `torch::autograd::VariableType`), constructing a new + /// tensor from this `TensorOptions` will use this same derived type. If + /// instead the given `type` were destructured into its components (backend, + /// dtype and layout), information about the runtime type of the `Type` would + /// be lost. Set `discard_runtime_type` to `true` to always destructure the + /// type into its components and discard its runtime type. + /* implicit */ TensorOptions( + const Type& type, + int32_t device_index = -1, + bool discard_runtime_type = false) { + if (!discard_runtime_type) { + type_ = &type; + } + this->dtype(type.scalarType()); + this->device({type.backend(), device_index}); + this->layout(type.layout()); + } + + /// Constructs a `TensorOptions` object with the given layout. + /* implicit */ TensorOptions(Layout layout) : TensorOptions() { + this->layout(layout); + } + + /// Constructs a `TensorOptions` object with the given device. + /* implicit */ TensorOptions(Device device) : TensorOptions() { + this->device(device); + } + + /// Constructs a `TensorOptions` object from a backend, forwarded to the + /// `Device` constructor. + /* implicit */ TensorOptions(Backend backend) + : TensorOptions(Device(backend)) {} + + /// Constructs a `TensorOptions` object with the given dtype. + /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { + this->dtype(dtype); + } + + /// True if all elements of the `TensorOptions` match that of the other. + bool operator==(const TensorOptions& other) const noexcept { + return dtype_ == other.dtype_ && layout_ == other.layout_ && + device_ == other.device_ && requires_grad_ == other.requires_grad_; + } + + /// True if any of the elements of this `TensorOptions` do not match that of + /// the other. + bool operator!=(const TensorOptions& other) const noexcept { + return !(*this == other); + } + + /// Discards the runtime type stored if the `TensorOptions` was constructed + /// from a `Tensor` or a `Type`. See the documentation of the constructor from + /// a `Type` for implications on the behavior of the `type()` method on + /// `TensorOptions`. + const TensorOptions& discard_runtime_type() const { + type_ = nullptr; + return *this; + } + + /// Sets the device of the `TensorOptions`. + TensorOptions& device(Device device) { + device_ = std::move(device); + update_underlying_type(); + return *this; + } + + /// Sets the device of the `TensorOptions` to CUDA, and then sets the device + /// index to the given one. + TensorOptions& device_index(int32_t device_index) { + return device({Device::Type::CUDA, device_index}); + } + + /// Sets the dtype of the `TensorOptions`. + TensorOptions& dtype(ScalarType dtype) { + dtype_ = dtype; + update_underlying_type(); + return *this; + } + + /// Sets the layout of the `TensorOptions`. 
+ TensorOptions& layout(Layout layout) { + layout_ = layout; + update_underlying_type(); + return *this; + } + + /// Sets the `requires_grad` property of the `TensorOptions`. + TensorOptions& requires_grad(bool requires_grad) { + requires_grad_ = requires_grad; + return *this; + } + + /// Returns the device of the `TensorOptions`. + const Device& device() const noexcept { + return device_; + } + + /// Returns the device index of the `TensorOptions`. + int32_t device_index() const noexcept { + return device_.index(); + } + + /// Returns the dtype of the `TensorOptions`. + ScalarType dtype() const noexcept { + return dtype_; + } + + /// Returns the layout of the `TensorOptions`. + Layout layout() const noexcept { + return layout_; + } + + /// Returns the `requires_grad` property of the `TensorOptions`. + bool requires_grad() const noexcept { + return requires_grad_; + } + + /// Constructs an `at::Type` from the members of the `TensorOptions`. + const Type& type() const { + if (type_ != nullptr) { + return *type_; + } + return getType(backend(), dtype_); + } + + private: + /// Updates any stored underlying type to the current construction axes. + void update_underlying_type() { + if (type_) { + type_ = &type_->toScalarType(dtype_).toBackend(backend()); + } + } + + // Resolves the ATen backend specified by the current construction axes. + Backend backend() const noexcept { + Backend backend; + if (device_.type() == Device::Type::CPU) { + backend = (layout_ == kStrided) ? kCPU : kSparseCPU; + } else { + backend = (layout_ == kStrided) ? kCUDA : kSparseCUDA; + } + return backend; + } + + private: + ScalarType dtype_{kFloat}; + Device device_{Device::Type::CPU}; + Layout layout_{Layout::Strided}; + bool requires_grad_{false}; + // Not part of the observable API, so make `mutable` so we can set it to + // `null` in `discard_runtime_type`. + mutable const Type* type_{nullptr}; +}; + +/// Convenience function that returns a `TensorOptions` object with the `dtype` +/// set to the given one. +inline TensorOptions dtype(ScalarType dtype) { + return TensorOptions().dtype(dtype); +} + +/// Convenience function that returns a `TensorOptions` object with the `layout` +/// set to the given one. +inline TensorOptions layout(Layout layout) { + return TensorOptions().layout(layout); +} + +/// Convenience function that returns a `TensorOptions` object with the `device` +/// set to the given one. +inline TensorOptions device(Device device) { + return TensorOptions().device(std::move(device)); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `device_index` set to the given one. +inline TensorOptions device_index(int32_t device_index) { + return TensorOptions().device_index(device_index); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `requires_grad` set to the given one. +inline TensorOptions requires_grad(bool requires_grad = true) { + return TensorOptions().requires_grad(requires_grad); +} + +/// From Tensor.h +inline TensorOptions Tensor::options() const { + return TensorOptions(*this); +} + +namespace detail { +inline Tensor to( + const Tensor& tensor, + const TensorOptions& options, + bool non_blocking) { + // Don't copy if the options match. 
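The convenience functions above (dtype, layout, device, requires_grad) are meant to be chained as a fluent builder. The sketch below is a hypothetical usage, assuming an ATen build where this header is reachable; the header name and the chosen dtype/device values are assumptions for illustration only.

```
// Hypothetical usage of the fluent TensorOptions API above.
#include <ATen/ATen.h>

void options_example() {
  at::TensorOptions opts = at::dtype(at::kFloat)
                               .layout(at::kStrided)
                               .device(at::Device(at::Device::Type::CPU))
                               .requires_grad(false);
  // With a CPU device and strided layout, the (private) backend() helper
  // resolves to kCPU, so opts.type() yields the concrete CPU float Type
  // that factory functions can use to allocate a tensor.
  (void)opts;
}
```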
+ if (tensor.options() == options) { + return tensor; + } + DeviceGuard guard(options.device()); + return options.type().copy(tensor, non_blocking); +} +} // namespace detail + +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) + const { + if (this->device() == device && this->dtype() == dtype) { + return *this; + } + return detail::to(*this, options().device(device).dtype(dtype), non_blocking); +} + +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + if (this->dtype() == dtype) { + return *this; + } + return detail::to(*this, options().dtype(dtype), non_blocking); +} + +inline Tensor Tensor::to(Device device, bool non_blocking) const { + if (this->device() == device) { + return *this; + } + return detail::to(*this, options().device(device), non_blocking); +} +} // namespace at diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp new file mode 100644 index 0000000..2652212 --- /dev/null +++ b/aten/src/ATen/TensorUtils.cpp @@ -0,0 +1,218 @@ +#include "ATen/Config.h" +#include "ATen/TensorUtils.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace at { + +std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { + if (t.pos == 0) { + // 0 is distinguished; it usually indicates 'self' or the return + // tensor + out << "'" << t.name << "'"; + } else { + out << "argument #" << t.pos << " '" << t.name << "'"; + } + return out; +} + +void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) { + AT_CHECK(t->dim() == dim, + "Expected ", dim, "-dimensional tensor, but got ", t->dim(), + "-dimensional tensor for ", t," (while checking arguments for ", c, ")"); +} + +void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end) { + AT_CHECK( + t->dim() >= dim_start && t->dim() < dim_end, + "Expected ", dim_start, " to ", (dim_end - 1), " dimensions, but got ", + t->dim(), "-dimensional tensor for ", t, " (while checking arguments for ", + c, ")"); +} + +void checkContiguous(CheckedFrom c, const TensorGeometryArg& t) { + AT_CHECK( + t->is_contiguous(), + "Expected contiguous tensor, but got non-contiguous tensor for ", t, + " (while checking arguments for ", c, ")"); +} + +void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { + for (auto& t : ts) { + if (!t->defined()) continue; + checkContiguous(c, t); + } +} + +void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes) { + checkDim(c, t, sizes.size()); + AT_CHECK( + t->sizes().equals(sizes), + "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), + " for ", t, " (while checking arguments for ", c, ")"); +} + +void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size) { + AT_CHECK( + t->size(dim) == size, + "Expected tensor to have size ", size, " at dimension ", dim, + ", but got size ", t->size(dim), " for ", t, + " (while checking arguments for ", c, ")"); +} + +void checkAllSame(CheckedFrom c, ArrayRef tensors, void(*fn)(CheckedFrom, const TensorArg&, const TensorArg&)) { + const TensorArg* t0 = nullptr; + for (auto& t : tensors) { + if (!t->defined()) continue; + if (t0 != nullptr) { + fn(c, *t0, t); + } else { + t0 = &t; + } + } +} + +void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->sizes().equals(t2->sizes()), + "Expected tensor for ", t1, " to have same size as tensor for ", t2, + "; but ", t1->sizes(), " does not equal ", t2->sizes(), + " (while checking arguments for ", c, ")"); +} + +void 
checkAllSameSize(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameSize); +} + +void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { + AT_CHECK( + t->numel() == numel, + "Expected tensor for ", t, " to have ", numel, + " elements; but it actually has ", t->numel(), " elements", + " (while checking arguments for ", c, ")"); +} + +void checkSameNumel(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->numel() == t2->numel(), + "Expected tensor for ", t1, + " to have same number of elements as tensor for ", t2, "; but ", + t1->numel(), " does not equal ", t2->numel(), + " (while checking arguments for ", c, ")"); +} + +void checkAllSameNumel(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameNumel); +} + +void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + if (! (t1->is_cuda()) || ! (t2->is_cuda())) { + std::ostringstream oss; + if (! t1->is_cuda()) { + oss << "Tensor for " << t1 << " is on CPU, "; + } + if (! t2->is_cuda()) { + oss << "Tensor for " << t2 << " is on CPU, "; + } + oss << "but expected " << ((!(t1->is_cuda() || t2->is_cuda())) ? "them" : "it") + << " to be on GPU (while checking arguments for " << c << ")"; + AT_ERROR(oss.str()); + } + AT_CHECK( + t1->get_device() == t2->get_device(), + "Expected tensor for ", t1, " to have the same device as tensor for ", t2, + "; but device ", t1->get_device(), " does not equal ", t2->get_device(), + " (while checking arguments for ", c, ")"); +} + +void checkAllSameGPU(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameGPU); +} + +void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { + AT_CHECK( + t1->type() == t2->type(), + "Expected tensor for ", t1, " to have the same type as tensor for ", t2, + "; but type ", t1->toString(), " does not equal ", t2->toString(), + " (while checking arguments for ", c, ")"); +} + +void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType ty) { + AT_CHECK( + t->type().scalarType() == ty, + "Expected tensor for ", t, " to have scalar type ", toString(ty), + "; but got ", t->toString(), " instead (while checking arguments for ", c, + ")"); +} + +void checkScalarTypes(CheckedFrom c, const TensorArg& t, + at::ArrayRef l) { + if (std::find(l.begin(), l.end(), t->type().scalarType()) == l.end()) { + std::ostringstream oss; + oss << "Expected tensor for " << t << " to have one of the following " + << "scalar types: "; + size_t i = 0; + for (auto ty : l) { + if (i != 0) { + oss << ", "; + } + oss << toString(ty); + i++; + } + oss << "; but got " << t->toString() + << " instead (while checking arguments for " << c << ")"; + AT_ERROR(oss.str()); + } +} + +void checkAllSameType(CheckedFrom c, ArrayRef tensors) { + checkAllSame(c, tensors, checkSameType); +} + +void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2) { + AT_CHECK( + t1->dim() == t2->dim(), + "Expected tensor for ", t1, " to have the same dimension as tensor for ", + t2, "; but ", t1->dim(), " does not equal ", t2->dim(), + " (while checking arguments for ", c, ")"); +} + +void checkDefined(CheckedFrom c, const TensorArg& t) { + AT_CHECK( + t->defined(), + "Expected tensor for ", t, " to be non-null, but it was undefined ", + " (while checking arguments for ", c, ")"); +} + +void checkAllDefined(CheckedFrom c, ArrayRef ts) { + // NB: don't filter defined here + for (auto t : ts) { + checkDefined(c, t); + } +} + +void checkBackend(CheckedFrom c, const 
Tensor& t, Backend backend) { + AT_CHECK( + t.type().backend() == backend, + "Expected tensor to have ", toString(backend), + " Backend, but got tensor with ", toString(t.type().backend()), " Backend ", + "(while checking arguments for ", c, ")"); +} + +void checkBackend(CheckedFrom c, ArrayRef tensors, at::Backend backend) { + for (auto &t : tensors) { + checkBackend(c, t, backend); + } +} + +void * maybe_data_ptr(const Tensor& tensor) { + return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; +} + +void * maybe_data_ptr(const TensorArg& tensor) { + return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; +} +} diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h new file mode 100644 index 0000000..cc7453f --- /dev/null +++ b/aten/src/ATen/TensorUtils.h @@ -0,0 +1,81 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/TensorGeometry.h" +#include "ATen/Utils.h" + +// These functions are NOT in Utils.h, because this file has a dep on Tensor.h + +namespace at { + +// The following are utility functions for checking that arguments +// make sense. These are particularly useful for native functions, +// which do NO argument checking by default. + +struct AT_API TensorArg { + Tensor tensor; + const char* name; + int pos; // 1-indexed + TensorArg(Tensor tensor, const char* name, int pos) + : tensor(std::move(tensor)), name(name), pos(pos) {} + const Tensor* operator->() const { return &tensor; } + const Tensor& operator*() const { return tensor; } +}; + +struct AT_API TensorGeometryArg { + TensorGeometry tensor; + const char* name; + int pos; // 1-indexed + /* implicit */ TensorGeometryArg(TensorArg arg) + : tensor(TensorGeometry{arg.tensor}), name(arg.name), pos(arg.pos) {} + TensorGeometryArg(TensorGeometry tensor, const char* name, int pos) + : tensor(tensor), name(name), pos(pos) {} + const TensorGeometry* operator->() const { return &tensor; } + const TensorGeometry& operator*() const { return tensor; } +}; + +// A string describing which function did checks on its input +// arguments. +// TODO: Consider generalizing this into a call stack. +using CheckedFrom = const char*; + +// The undefined convention: singular operators assume their arguments +// are defined, but functions which take multiple tensors will +// implicitly filter out undefined tensors (to make it easier to perform +// tests which should apply if the tensor is defined, and should not +// otherwise.) +// +// NB: This means that the n-ary operators take lists of TensorArg, +// not TensorGeometryArg, because the Tensor to TensorGeometry +// conversion will blow up if you have undefined tensors. 
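checkAllSame above folds a binary check over the argument list: every defined tensor is compared against the first defined one, and undefined tensors are skipped, as the comment describes. Here is a toy standalone version of that pattern; the Arg type, field names, and error text are made up.

```
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct Arg {
  std::string name;
  int pos;      // 1-indexed position, 0 means "self"
  long numel;   // stand-in for whatever property is being compared
  bool defined;
};

using CheckFn = void (*)(const char*, const Arg&, const Arg&);

void checkSameNumel(const char* c, const Arg& a, const Arg& b) {
  if (a.numel != b.numel) {
    throw std::runtime_error(std::string("Expected '") + a.name + "' and '" +
                             b.name + "' to have the same number of elements "
                             "(while checking arguments for " + c + ")");
  }
}

void checkAllSame(const char* c, const std::vector<Arg>& args, CheckFn fn) {
  const Arg* first = nullptr;
  for (const auto& a : args) {
    if (!a.defined) continue;  // undefined tensors are filtered out
    if (first) {
      fn(c, *first, a);
    } else {
      first = &a;
    }
  }
}

int main() {
  std::vector<Arg> args = {{"self", 0, 6, true},
                           {"other", 1, 6, true},
                           {"out", 2, 0, false}};
  checkAllSame("add_out", args, checkSameNumel);  // passes; undefined 'out' is skipped
  std::cout << "ok\n";
  return 0;
}
```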
+ +AT_API std::ostream& operator<<(std::ostream & out, TensorGeometryArg t); +AT_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim); +// NB: this is an inclusive-exclusive range +AT_API void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end); +AT_API void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); +AT_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); +AT_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); +AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes); +AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size); +AT_API void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel); +AT_API void checkSameNumel(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); +AT_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); +AT_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s); +AT_API void checkScalarTypes(CheckedFrom c, const TensorArg& t, at::ArrayRef l); +AT_API void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); +AT_API void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); +AT_API void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); +AT_API void checkDefined(CheckedFrom c, const TensorArg& t); +AT_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); + +// FixMe: does TensorArg slow things down? +AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend backend); + +// Methods for getting data_ptr if tensor is defined +AT_API void * maybe_data_ptr(const Tensor& tensor); +AT_API void * maybe_data_ptr(const TensorArg& tensor); + +} diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp new file mode 100644 index 0000000..9c9e989 --- /dev/null +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -0,0 +1,43 @@ +#include "ATen/UndefinedTensor.h" +#include "ATen/Context.h" +#include "ATen/Error.h" + +namespace at { + +// should this use the globalContext? Can it get a context passed in somehow? 
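UndefinedTensor is a null-object singleton: one statically allocated sentinel stands in for "no tensor", defined() reduces to a pointer comparison against it, and every accessor raises instead of returning garbage. A minimal sketch of the same pattern with toy classes (not the ATen types):

```
#include <iostream>
#include <stdexcept>

struct Impl {
  virtual ~Impl() = default;
  virtual long dim() const = 0;
};

struct UndefinedImpl final : Impl {
  static UndefinedImpl* singleton() { return &instance_; }
  long dim() const override {
    throw std::runtime_error("dim() called on undefined Tensor");
  }
 private:
  UndefinedImpl() = default;
  static UndefinedImpl instance_;
};
UndefinedImpl UndefinedImpl::instance_;

struct Handle {
  Impl* p = UndefinedImpl::singleton();  // default handles are "undefined"
  bool defined() const { return p != UndefinedImpl::singleton(); }
};

int main() {
  Handle h;
  std::cout << std::boolalpha << h.defined() << "\n";  // false
  try {
    h.p->dim();
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```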
+UndefinedTensor::UndefinedTensor() +: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined))) { +} + +const char * UndefinedTensor::toString() const { + return "UndefinedTensor"; +} + +IntList UndefinedTensor::sizes() const { + AT_ERROR("sizes() called on undefined Tensor"); +} + +int64_t UndefinedTensor::dim() const { + AT_ERROR("dim() called on undefined Tensor"); +} + +const char * UndefinedTensor::typeString() { + return "UndefinedType"; +} +void * UndefinedTensor::unsafeGetTH(bool retain) { + AT_ERROR("unsafeGetTH(bool retain) called on undefined Tensor"); +} +std::unique_ptr UndefinedTensor::storage() { + AT_ERROR("storage() called on undefined Tensor"); +} + +IntList UndefinedTensor::strides() const { + AT_ERROR("strides() called on undefined Tensor"); +} +Scalar UndefinedTensor::localScalar() { + AT_ERROR("localScalar() called on undefined Tensor"); +} + +UndefinedTensor UndefinedTensor::_singleton; + +} diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/UndefinedTensor.h new file mode 100644 index 0000000..d501f24 --- /dev/null +++ b/aten/src/ATen/UndefinedTensor.h @@ -0,0 +1,27 @@ +#pragma once + +#include "ATen/TensorImpl.h" + +namespace at { + +struct AT_API UndefinedTensor final : public TensorImpl { +public: + static inline UndefinedTensor * singleton() { + return &_singleton; + } + const char * toString() const override; + IntList sizes() const override; + IntList strides() const override; + int64_t dim() const override; + Scalar localScalar() override; + void * unsafeGetTH(bool retain) override; + std::unique_ptr storage() override; + static const char * typeString(); +private: + UndefinedTensor(); + static UndefinedTensor _singleton; +public: + friend struct UndefinedType; +}; + +} // namespace at diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp new file mode 100644 index 0000000..068f9b7 --- /dev/null +++ b/aten/src/ATen/UndefinedType.cpp @@ -0,0 +1,76 @@ +#include "ATen/UndefinedType.h" +#include "ATen/Error.h" + +namespace at { + +UndefinedType::UndefinedType(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/true) {} +ScalarType UndefinedType::scalarType() const { + return ScalarType::Undefined; +} +Backend UndefinedType::backend() const { + return Backend::Undefined; +} +bool UndefinedType::is_cuda() const { return false; } +bool UndefinedType::is_sparse() const { return false; } +bool UndefinedType::is_distributed() const { return false; } + +std::unique_ptr UndefinedType::storage() const { + AT_ERROR("storage not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storage(size_t size) const { + AT_ERROR("storage(size_t) not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + AT_ERROR("storageFromBlob not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::unsafeStorageFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeStorageFromTH not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::storageWithAllocator(int64_t size, Allocator* allocator) const { + AT_ERROR("storageWithAllocator not defined for UndefinedType"); +} +Tensor UndefinedType::unsafeTensorFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not defined for UndefinedType"); +} +std::unique_ptr UndefinedType::generator() const { + AT_ERROR("generator not defined for UndefinedType"); +} + +const char * UndefinedType::toString() const { + return 
UndefinedType::typeString(); +} +TypeID UndefinedType::ID() const { + return TypeID::Undefined; +} + +size_t UndefinedType::elementSizeInBytes() const { + AT_ERROR("elementSizeInBytes not defined for UndefinedType"); +} + +Type & UndefinedType::toBackend(Backend b) const { + if (b == Backend::Undefined) { + return Type::toBackend(b); + } + AT_ERROR("toBackend not implemented for UndefinedType to non-UndefinedType"); +} +Type & UndefinedType::toScalarType(ScalarType s) const { + if (s == ScalarType::Undefined) { + return Type::toScalarType(s); + } + AT_ERROR("toScalarType not implemented for UndefinedType to non-UndefinedType"); +} + +const char * UndefinedType::typeString() { + return "UndefinedType"; +} + +Tensor & UndefinedType::s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const { + AT_ERROR("s_copy not defined for UndefinedType"); +} + +Tensor & UndefinedType::_s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const { + AT_ERROR("_s_copy_from not defined for UndefinedType"); +} + +} diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h new file mode 100644 index 0000000..913066b --- /dev/null +++ b/aten/src/ATen/UndefinedType.h @@ -0,0 +1,40 @@ +#pragma once + +#include "ATen/Type.h" +#include "ATen/Context.h" +#include "ATen/CheckGenerator.h" + +#ifdef _MSC_VER +#ifdef Type +#undef Type +#endif +#endif + +namespace at { + +struct UndefinedType final : public Type { + explicit UndefinedType(Context* context); + virtual ScalarType scalarType() const override; + virtual Backend backend() const override; + virtual bool is_cuda() const override; + virtual bool is_sparse() const override; + virtual bool is_distributed() const override; + virtual std::unique_ptr storage() const override; + virtual std::unique_ptr storage(size_t size) const override; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter) const override; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const override; + virtual std::unique_ptr generator() const override; + virtual const char * toString() const override; + virtual size_t elementSizeInBytes() const override; + virtual Type & toBackend(Backend b) const override; + virtual Type & toScalarType(ScalarType s) const override; + virtual TypeID ID() const override; + static const char * typeString(); + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const override; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; + + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; +}; + +} // namespace at diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp new file mode 100644 index 0000000..3ce4952 --- /dev/null +++ b/aten/src/ATen/Utils.cpp @@ -0,0 +1,15 @@ +#include "ATen/Utils.h" +#include +#include +#include +#include + +namespace at { + +int _crash_if_asan(int arg) { + volatile char x[3]; + x[arg] = 0; + return x[0]; +} + +} // at diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h new file mode 100644 index 0000000..ccefa25 --- /dev/null +++ b/aten/src/ATen/Utils.h @@ -0,0 +1,87 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include "ATen/ArrayRef.h" +#include "ATen/Error.h" +#include "ATen/UndefinedTensor.h" + +#include +#include +#include +#include + +#if defined(__clang__) +#define 
__ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) +#define __ubsan_ignore_vptr__ __attribute__((no_sanitize("vptr"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#define __ubsan_ignore_vptr__ +#endif + +namespace at { + +AT_API int _crash_if_asan(int); + +template +static inline T* checked_cast_storage(Base* expr, const char * name, int pos) { + if (typeid(*expr) != typeid(T)) + AT_ERROR("Expected object of type ", T::typeString(), " but found type ", expr->type().toString(), + " for argument #", pos, " '", name, "'"); + return static_cast(expr); +} + +template +inline T* checked_cast_tensor(Base* expr, const char * name, int pos, bool allowNull) { + if(allowNull && expr == UndefinedTensor::singleton()) { + return nullptr; + } + if (typeid(*expr) != typeid(T)) + AT_ERROR("Expected object of type ", T::typeString(), " but found type ", expr->type().toString(), + " for argument #", pos, " '", name, "'"); + return static_cast(expr); +} + +// Converts a TensorList (i.e. ArrayRef to the underlying TH* Tensor Pointer) +template +static inline std::vector tensor_list_checked_cast(ArrayRef tensors, const char * name, int pos) { + std::vector casted(tensors.size()); + for (unsigned int i = 0; i < tensors.size(); ++i) { + auto *expr = tensors[i].pImpl; + auto result = dynamic_cast(expr); + if (result) { + casted[i] = result->tensor; + } else { + AT_ERROR("Expected a Tensor of type ", T::typeString(), " but found a type ", expr->type().toString(), + " for sequence element ", i, " in sequence argument at position #", pos, " '", name, "'"); + + } + } + return casted; +} + +template +std::array check_intlist(ArrayRef list, const char * name, int pos, ArrayRef def={}) { + if (list.empty()) { + list = def; + } + auto res = std::array(); + if (list.size() == 1 && N > 1) { + res.fill(list[0]); + return res; + } + if (list.size() != N) { + AT_ERROR("Expected a list of ", N, " ints but got ", list.size(), " for argument #", pos, " '", name, "'"); + } + std::copy_n(list.begin(), N, res.begin()); + return res; +} + +inline int64_t sum_intlist(ArrayRef list) { + return std::accumulate(list.begin(), list.end(), 0); +} + +inline int64_t prod_intlist(ArrayRef list) { + return std::accumulate(list.begin(), list.end(), 1, std::multiplies()); +} + +} // at diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h new file mode 100644 index 0000000..a07efa2 --- /dev/null +++ b/aten/src/ATen/WrapDimUtils.h @@ -0,0 +1,89 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include + +namespace at { + +static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) { + if (dim_post_expr <= 0) { + if (!wrap_scalar) { + std::ostringstream oss; + oss << "dimension specified as " << dim << " but tensor has no dimensions"; + throw std::runtime_error(oss.str()); + } + dim_post_expr = 1; // this will make range [-1, 0] + } + + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + AT_CHECK( + dim >= min && dim <= max, + "Dimension out of range (expected to be in range of [", + min, ", ", max, "], but got ", dim, ")"); + if (dim < 0) dim += dim_post_expr; + return dim; +} + +static inline int64_t maybe_wrap_dim(int64_t dim, TensorImpl *tensor) { + return maybe_wrap_dim(dim, tensor->dim()); +} + +static inline int64_t maybe_wrap_dim(int64_t dim, TensorList tensors) { + if (tensors.size() == 0) { + // can't wrap empty TensorList; rely on underlying implementation to throw error if necessary. 
+ return dim; + } + return maybe_wrap_dim(dim, tensors[0].dim()); +} + +static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector> & tensor_sizes) { + if (tensor_sizes.size() == 0) { + // can't wrap empty list; rely on underlying implementation to throw error if necessary + return dim; + } + return maybe_wrap_dim(dim, tensor_sizes[0].size()); +} + +// wrap each of dims basing on dim_post_expr +static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_expr) { + if (dim_post_expr <= 0) { + dim_post_expr = 1; // this will make range [-1, 0] + } + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + for (auto& dim : dims) { + AT_CHECK( + dim >= min && dim <= max, + "Dimension out of range (expected to be in range of [", + min, ", ", max, "], but got ", dim, ")"); + if (dim < 0) dim += dim_post_expr; + } +} + +// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible +// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors +// to be "skipped" (both for wrap dimension behavior and dimension size checking). +// We maintain this behavior for backwards compatibility, but only for this specific size +// (i.e. other empty sizes are not skipped). +static inline int64_t legacy_cat_wrap_dim(int64_t dim, const std::vector>& tensor_sizes) { + for (auto& sizes : tensor_sizes) { + if (sizes == std::vector({0})) { + continue; + } + return maybe_wrap_dim(dim, sizes.size()); + } + return dim; +} + +static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.dim() == 1 && tensor.sizes()[0] == 0) { + continue; + } + return maybe_wrap_dim(dim, tensor.dim()); + } + return dim; +} + +} diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h new file mode 100644 index 0000000..f3d3a81 --- /dev/null +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include "ATen/WrapDimUtils.h" +#include +#include + +namespace at { + +// This is in an extra file to work around strange interaction of +// bitset on Windows with operator overloading + +constexpr size_t dim_bitset_size = 64; + +static inline std::bitset dim_list_to_bitset(IntList dims, int64_t ndims, bool wrap_scalar=true) { + AT_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); + std::bitset seen; + for (size_t i = 0; i < dims.size(); i++) { + size_t dim = maybe_wrap_dim(dims[i], ndims); + AT_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); + seen[dim] = true; + } + return seen; +} + +} diff --git a/aten/src/ATen/code_template.py b/aten/src/ATen/code_template.py new file mode 100644 index 0000000..f239030 --- /dev/null +++ b/aten/src/ATen/code_template.py @@ -0,0 +1,77 @@ +import re + +# match $identifier or ${identifier} and replace with value in env +# If this identifier is at the beginning of whitespace on a line +# and its value is a list then it is treated as +# block subsitution by indenting to that depth and putting each element +# of the list on its own line +# if the identifier is on a line starting with non-whitespace and a list +# then it is comma separated ${,foo} will insert a comma before the list +# if this list is not empty and ${foo,} will insert one after. 
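The substitution rules spelled out in the comment above are easiest to see on a tiny template. The snippet below is a hypothetical usage, assuming the CodeTemplate class defined in this file; the template text and environment values are made up.

```
# Hypothetical CodeTemplate usage; values are made up for illustration.
env = {'name': 'add', 'args': ['Tensor self', 'Scalar other']}
t = CodeTemplate("""\
Tensor ${name}($args) {
  $body
}
""")
# '$args' sits after non-whitespace on its line, so the list is joined with
# commas; '$body' starts a line after whitespace only, so each list element
# is emitted on its own line at that indentation.
print(t.substitute(env, body=['check_inputs();', 'return impl(self, other);']))
```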
+ + +class CodeTemplate(object): + substitution_str = '(^[^\n\S]*)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})' + + # older versions of Python have a bug where \w* does not work, + # so we need to replace with the non-shortened version [a-zA-Z0-9_]* + # https://bugs.python.org/issue18647 + + substitution_str = substitution_str.replace('\w', '[a-zA-Z0-9_]') + + subtitution = re.compile(substitution_str, re.MULTILINE) + + @staticmethod + def from_file(filename): + with open(filename, 'r') as f: + return CodeTemplate(f.read()) + + def __init__(self, pattern): + self.pattern = pattern + + def substitute(self, env={}, **kwargs): + def lookup(v): + return kwargs[v] if v in kwargs else env[v] + + def indent_lines(indent, v): + return "".join([indent + l + "\n" for e in v for l in str(e).splitlines()]).rstrip() + + def replace(match): + indent = match.group(1) + key = match.group(2) + comma_before = '' + comma_after = '' + if key[0] == "{": + key = key[1:-1] + if key[0] == ",": + comma_before = ', ' + key = key[1:] + if key[-1] == ',': + comma_after = ', ' + key = key[:-1] + v = lookup(key) + if indent is not None and isinstance(v, list): + return indent_lines(indent, v) + elif isinstance(v, list): + middle = ', '.join([str(x) for x in v]) + if len(v) == 0: + return middle + return comma_before + middle + comma_after + else: + return (indent or '') + str(v) + return self.subtitution.sub(replace, self.pattern) + + +if __name__ == "__main__": + c = CodeTemplate("""\ + int foo($args) { + + $bar + $bar + $a+$b + } + int commatest(int a${,stuff}) + int notest(int a${,empty,}) + """) + print(c.substitute(args=["hi", 8], bar=["what", 7], + a=3, b=4, stuff=["things...", "others"], empty=[])) diff --git a/aten/src/ATen/common_with_cwrap.py b/aten/src/ATen/common_with_cwrap.py new file mode 100644 index 0000000..9596369 --- /dev/null +++ b/aten/src/ATen/common_with_cwrap.py @@ -0,0 +1,207 @@ +# this code should be common among cwrap and ATen preprocessing +# for now, I have put it in one place but right now is copied out of cwrap + +from copy import deepcopy +from itertools import product + + +def parse_arguments(args): + new_args = [] + for arg in args: + # Simple arg declaration of form " " + if isinstance(arg, str): + t, _, name = arg.partition(' ') + new_args.append({'type': t, 'name': name}) + elif isinstance(arg, dict): + if 'arg' in arg: + arg['type'], _, arg['name'] = arg['arg'].partition(' ') + del arg['arg'] + new_args.append(arg) + else: + assert False + return new_args + + +def set_declaration_defaults(declaration): + declaration.setdefault('arguments', []) + declaration.setdefault('return', 'void') + if 'cname' not in declaration: + declaration['cname'] = declaration['name'] + if 'backends' not in declaration: + declaration['backends'] = ['CPU', 'CUDA'] + if 'api_name' not in declaration: + declaration['api_name'] = (declaration['python_name'] + if 'python_name' in declaration else declaration['name']) + # Simulate multiple dispatch, even if it's not necessary + if 'options' not in declaration: + declaration['options'] = [{'arguments': declaration['arguments']}] + del declaration['arguments'] + # Parse arguments (some of them can be strings) + for option in declaration['options']: + option['arguments'] = parse_arguments(option['arguments']) + # Propagate defaults from declaration to options + for option in declaration['options']: + for k, v in declaration.items(): + # TODO(zach): why does cwrap not propagate 'name'? 
I need it + # propagaged for ATen + if k != 'options': + option.setdefault(k, v) + +# TODO(zach): added option to remove keyword handling for C++ which cannot +# support it. + + +def filter_unique_options(options, allow_kwarg, type_to_signature, remove_self): + def exclude_arg(arg): + return arg.get('ignore_check') or arg['type'] == 'CONSTANT' + + def exclude_arg_with_self_check(arg): + return exclude_arg(arg) or (remove_self and arg['name'] == 'self') + + def signature(option, kwarg_only_count): + if kwarg_only_count == 0: + kwarg_only_count = None + else: + kwarg_only_count = -kwarg_only_count + arg_signature = '#'.join( + type_to_signature.get(arg['type'], arg['type']) + for arg in option['arguments'][:kwarg_only_count] + if not exclude_arg_with_self_check(arg)) + if kwarg_only_count is None: + return arg_signature + kwarg_only_signature = '#'.join( + arg['name'] + '#' + arg['type'] + for arg in option['arguments'][kwarg_only_count:] + if not exclude_arg(arg)) + return arg_signature + "#-#" + kwarg_only_signature + seen_signatures = set() + unique = [] + for option in options: + # if only check num_kwarg_only == 0 if allow_kwarg == False + limit = len(option['arguments']) if allow_kwarg else 0 + for num_kwarg_only in range(0, limit + 1): + sig = signature(option, num_kwarg_only) + if sig not in seen_signatures: + if num_kwarg_only > 0: + for arg in option['arguments'][-num_kwarg_only:]: + arg['kwarg_only'] = True + unique.append(option) + seen_signatures.add(sig) + break + return unique + + +def enumerate_options_due_to_default(declaration, + allow_kwarg=True, type_to_signature=[], remove_self=True): + + # Checks to see if an argument with a default keyword is a Tensor that + # by default can be NULL. In this case, instead of generating another + # option that excludes this argument, we will instead generate a single + # function call that allows for the Tensor to be NULL + def is_nullable_tensor_arg(arg): + return arg['type'] == 'THTensor*' and arg['default'] == 'nullptr' + + # TODO(zach): in cwrap this is shared among all declarations + # but seems to assume that all declarations will have the same + new_options = [] + for option in declaration['options']: + optional_args = [] + for i, arg in enumerate(option['arguments']): + if 'default' in arg: + optional_args.append(i) + for permutation in product((True, False), repeat=len(optional_args)): + option_copy = deepcopy(option) + option_copy['has_full_argument_list'] = sum(permutation) == len(optional_args) + for i, bit in zip(optional_args, permutation): + arg = option_copy['arguments'][i] + # PyYAML interprets NULL as None... 
+ arg['default'] = 'NULL' if arg['default'] is None else arg['default'] + if not bit: + arg['declared_type'] = arg['type'] + arg['type'] = 'CONSTANT' + arg['ignore_check'] = True + new_options.append(option_copy) + declaration['options'] = filter_unique_options(new_options, + allow_kwarg, type_to_signature, remove_self) + + +def sort_by_number_of_options(declaration, reverse=True): + def num_checked_args(option): + return sum(map(lambda a: not a.get('ignore_check', False), option['arguments'])) + declaration['options'].sort(key=num_checked_args, reverse=reverse) + + +class Function(object): + + def __init__(self, name): + self.name = name + self.arguments = [] + + def add_argument(self, arg): + assert isinstance(arg, Argument) + self.arguments.append(arg) + + def __repr__(self): + return self.name + '(' + ', '.join(map(lambda a: a.__repr__(), self.arguments)) + ')' + + +class Argument(object): + + def __init__(self, _type, name, is_optional): + self.type = _type + self.name = name + self.is_optional = is_optional + + def __repr__(self): + return self.type + ' ' + self.name + + +def parse_header(path): + with open(path, 'r') as f: + lines = f.read().split('\n') + + # Remove empty lines and prebackend directives + lines = filter(lambda l: l and not l.startswith('#'), lines) + # Remove line comments + lines = map(lambda l: l.partition('//'), lines) + # Select line and comment part + lines = map(lambda l: (l[0].strip(), l[2].strip()), lines) + # Remove trailing special signs + lines = map(lambda l: (l[0].rstrip(');').rstrip(','), l[1]), lines) + # Split arguments + lines = map(lambda l: (l[0].split(','), l[1]), lines) + # Flatten lines + new_lines = [] + for l, c in lines: + for split in l: + new_lines.append((split, c)) + lines = new_lines + del new_lines + # Remove unnecessary whitespace + lines = map(lambda l: (l[0].strip(), l[1]), lines) + # Remove empty lines + lines = filter(lambda l: l[0], lines) + generic_functions = [] + for l, c in lines: + if l.startswith('TH_API void THNN_'): + fn_name = l.lstrip('TH_API void THNN_') + if fn_name[0] == '(' and fn_name[-2] == ')': + fn_name = fn_name[1:-2] + else: + fn_name = fn_name[:-1] + generic_functions.append(Function(fn_name)) + elif l.startswith('THC_API void THNN_'): + fn_name = l.lstrip('THC_API void THNN_') + if fn_name[0] == '(' and fn_name[-2] == ')': + fn_name = fn_name[1:-2] + else: + fn_name = fn_name[:-1] + generic_functions.append(Function(fn_name)) + elif l: + t, name = l.split() + if '*' in name: + t = t + '*' + name = name[1:] + generic_functions[-1].add_argument( + Argument(t, name, '[OPTIONAL]' in c)) + return generic_functions diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py new file mode 100644 index 0000000..02eb56e --- /dev/null +++ b/aten/src/ATen/copy_wrapper.py @@ -0,0 +1,248 @@ +from code_template import CodeTemplate +from function_wrapper import nested_dict + +FILE = CodeTemplate("""\ +// ${generated_comment} + +#include "ATen/Config.h" + +#include "TH/TH.h" +${cuda_includes} +#include "ATen/Utils.h" +${copy_includes} + +namespace at { + +${copy_functions} + +} +""") + +CUDA_INCLUDES = """\ +#undef THNN_ +#include "THC/THC.h" +""" + +# NB: The copy templates static_cast both dst and src, even though +# technically we also perform a checked_cast_tensor in the prologue +# of the copy (meaning that hypothetically, an already casted tensor +# is available. However, in s_copy, the casted tensor is dst, while +# in _s_copy_from, the casted tensor is src. 
So we can reuse the logic +# in both cases, we unconditionally cast both tensors (and rely +# on the surrounding code to establish the necessary invariants.) + +COPY = CodeTemplate("""\ +${THTensor}_copy${cuda}${src_scalar_name}(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); +""") + +COPY_ASYNC_CPU = CodeTemplate("""\ +if (non_blocking) { + ${THTensor}_copyAsyncCPU(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); + break; +} +""") + +COPY_ASYNC_CUDA = CodeTemplate("""\ +if (non_blocking) { + ${THTensor}_copyAsyncCuda(${state,}\ +static_cast<${dst_tensor}*>(dst.pImpl)->tensor, \ +static_cast<${src_tensor}*>(src.pImpl)->tensor); + break; +} +""") + +CASE = CodeTemplate("""\ +case ${case_id}: + ${copies} + break; +""") + +FUNCTION = CodeTemplate("""\ +Tensor & ${Type}::s_copy_(Tensor & dst, const Tensor & src, bool non_blocking) const { + // code generated by copy_wrapper + ${checked_cast_dst} + switch (src.type().ID()) { + ${copy_body} + default: + ${function_fallthrough} + } + dst.pImpl->setScalar(src.pImpl->isScalar()); + return dst; +} +""") + +FUNCTION_FALLTHROUGH_REDISPATCH = "return src.type()._s_copy_from(src, dst, non_blocking);" + +FUNCTION_FALLTHROUGH_ERROR = """\ +AT_ERROR("copy does not support ", src.type().toString(), " to ", toString(), " copy."); +""" + +FUNCTION_FROM = CodeTemplate("""\ +Tensor & ${Type}::_s_copy_from(const Tensor & src, Tensor & dst, bool non_blocking) const { + // code generated by copy_wrapper + ${checked_cast_src} + switch (dst.type().ID()) { + ${copy_body} + default: + AT_ERROR("copy does not support ", toString(), " to ", dst.type().toString(), " copy."); + break; + } + dst.pImpl->setScalar(src.pImpl->isScalar()); + return dst; // NB! dst +} +""") + +# NB: Hypothetically, someone could call s_copy_from directly and get an error +# message which claims something is not supported, when it actually is. 
But +# the correct fix in this case was to NOT call copy_from +FUNCTION_FROM_SWAP = CodeTemplate("""\ +Tensor & ${Type}::_s_copy_from(const Tensor & src, Tensor & dst, bool non_blocking) const { + AT_ERROR("copy does not support ", src.type().toString(), " to ", dst.type().toString(), " copy (s_copy_from case)."); +} +""") + + +def create_one_copy(dst_type, all_types): + copy_body = [] + + for src_type in all_types: + if dst_type['Density'] == 'Sparse' or src_type['Density'] == 'Sparse': + # skip sparse copies, which are not yet implemented + continue + cuda = '' + state = [] + if src_type['Backend'] == 'CUDA' or dst_type['Backend'] == 'CUDA': + state.append('context->getTHCState()') + if src_type['Backend'] == 'CUDA': + if dst_type['Backend'] == 'CUDA': + cuda = 'Cuda' + else: + # don't attempt to process CPU-CUDA; this is handled in the + # redispatch + continue + + body_env = nested_dict({ + 'src_scalar_name': src_type['ScalarName'], + 'case_id': src_type['TypeID'], + 'src_tensor': src_type['Tensor'], + 'dst_tensor': dst_type['Tensor'], + 'cuda': cuda, + 'state': state, + }, dst_type) + + copies = [] + if dst_type['ScalarType'] == src_type['ScalarType']: + if dst_type['Backend'] == 'CUDA' and src_type['Backend'] == 'CPU': + copies.append(COPY_ASYNC_CPU.substitute(body_env)) + copies.append(COPY.substitute(body_env)) + + copy_body.append(CASE.substitute(body_env, copies=copies)) + + if dst_type['Backend'] == 'CPU': + # CPU fallthrough needs to redispatch to _s_copy_from + # (Backend == CPU implies Dense) + assert dst_type['Density'] == 'Dense' + function_fallthrough = FUNCTION_FALLTHROUGH_REDISPATCH + else: + function_fallthrough = FUNCTION_FALLTHROUGH_ERROR + + # Note [checked_cast_tensor is for dense only] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # checked_cast_tensor is only needed for backends which implement + # copy and thus do a cast. Sparse does not support copies, so there + # is no need to do a checked cast. (Furthermore, the code as written + # will not work, as it will try to there is no derived Tensor type + # for sparse.) 
+ checked_cast_dst = '' + if dst_type['Density'] == 'Dense': + checked_cast_dst = 'checked_cast_tensor<{}>(dst.pImpl, "dst", 0, false);'.format(dst_type['Tensor']) + + env = nested_dict({ + 'function_fallthrough': function_fallthrough, + 'checked_cast_dst': checked_cast_dst, + }, dst_type) + return FUNCTION.substitute(env, copy_body=copy_body) + + +def create_one_copy_from(src_type, all_types): + if src_type['DenseBackend'] == 'CPU': + return FUNCTION_FROM_SWAP.substitute(src_type) + + copy_body = [] + + for dst_type in all_types: + if dst_type['Density'] == 'Sparse' or src_type['Density'] == 'Sparse': + # skip sparse copies, which are not yet implemented + continue + cuda = '' + state = [] + if src_type['Backend'] == 'CUDA': + cuda = 'Cuda' + if dst_type['Backend'] == 'CUDA' or src_type['Backend'] == 'CUDA': + state.append('context->getTHCState()') + + body_env = nested_dict({ + 'src_scalar_name': src_type['ScalarName'], + 'case_id': dst_type['TypeID'], + 'src_tensor': src_type['Tensor'], + 'dst_tensor': dst_type['Tensor'], + 'cuda': cuda, + 'state': state, + }, dst_type) + + copies = [] + if dst_type['ScalarType'] == src_type['ScalarType']: + # NB: Technically, we have already short-circuited the + # src_type['Backend'] == 'CUDA' case at the beginning of this + # function + if dst_type['Backend'] == 'CPU' and src_type['Backend'] == 'CUDA': + copies.append(COPY_ASYNC_CUDA.substitute(body_env)) + copies.append(COPY.substitute(body_env)) + + copy_body.append(CASE.substitute(body_env, copies=copies)) + + # See Note [checked_cast_tensor is for dense only] + checked_cast_src = '' + if src_type['Density'] != 'Sparse': + checked_cast_src = 'checked_cast_tensor<{}>(src.pImpl, "src", 0, false);'.format(src_type['Tensor']) + + return FUNCTION_FROM.substitute(src_type, copy_body=copy_body, checked_cast_src=checked_cast_src) + + +def create(all_types, backend): + top_env = { + 'copy_includes': [], + 'copy_functions': [], + 'cuda_includes': [], + 'generated_comment': '@' + 'generated by aten/src/ATen/copy_wrapper.py' + } + + if backend == 'CUDA': + top_env['cuda_includes'].append(CUDA_INCLUDES) + + # Headers to include + for the_type in all_types: + # CUDA backend requires all headers (as it also manages CPU-CUDA + # conversions), but CPU backend should only have CPU headers + if backend == 'CPU' and the_type['DenseBackend'] != 'CPU': + continue + top_env['copy_includes'].append( + '#include "ATen/{}.h"'.format(the_type['Type'])) + if the_type['Density'] != 'Sparse': + # only Dense tensors have a derived Tensor type + top_env['copy_includes'].append( + '#include "ATen/{}.h"'.format(the_type['Tensor'])) + + # Code generation + for the_type in all_types: + # Only generate code for the requested backend + if the_type['DenseBackend'] != backend: + continue + top_env['copy_functions'].append(create_one_copy(the_type, all_types)) + top_env['copy_functions'].append(create_one_copy_from(the_type, all_types)) + + return FILE.substitute(top_env) diff --git a/aten/src/ATen/cpu/vec256/functional.h b/aten/src/ATen/cpu/vec256/functional.h new file mode 100644 index 0000000..c5e4efb --- /dev/null +++ b/aten/src/ATen/cpu/vec256/functional.h @@ -0,0 +1,139 @@ +#pragma once +#include "vec256.h" + +namespace at { namespace vec256 { + +// TODO: Make this more efficient +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + vec256::Vec256 acc_vec, + int64_t size) { + using Vec = vec256::Vec256; + scalar_t acc_arr[Vec::size]; + acc_vec.store(acc_arr); + for (int64_t i = 1; i < size; i++) { + scalar_t 
acc_arr_next[Vec::size]; + acc_arr_next[0] = acc_arr[i]; + Vec acc_vec_next = Vec::loadu(acc_arr_next); + acc_vec = vec_fun(acc_vec, acc_vec_next); + } + acc_vec.store(acc_arr); + return acc_arr[0]; +} + +template +inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) + return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); + int64_t d = Vec::size; + Vec acc_vec = Vec::loadu(data); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + acc_vec = vec_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(vec_fun, acc_vec, Vec::size); +} + +template +inline scalar_t map_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + scalar_t* data, + int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) + return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size); + int64_t d = Vec::size; + Vec acc_vec = map_fun(Vec::loadu(data)); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + data_vec = map_fun(data_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + data_vec = map_fun(data_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec, Vec::size); +} + +template +inline scalar_t map2_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + scalar_t* data, + scalar_t* data2, + int64_t size) { + using Vec = vec256::Vec256; + if (size < Vec::size) { + Vec data_vec = Vec::loadu(data, size); + Vec data2_vec = Vec::loadu(data2, size); + data_vec = map_fun(data_vec, data2_vec); + return vec_reduce_all(red_fun, data_vec, size); + } + int64_t d = Vec::size; + Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2)); + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(data + d); + Vec data2_vec = Vec::loadu(data2 + d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + Vec data2_vec = Vec::loadu(data2 + d, size - d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec, Vec::size); +} + +template +inline void map( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + int64_t size) { + using Vec = vec256::Vec256; + int64_t d = 0; + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d)); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d)); + output_vec.store(output_data + d, size - d); + } +} + +template +inline void map2( + const Op& vec_fun, + scalar_t* output_data, + scalar_t* input_data, + scalar_t* input_data2, + int64_t size) { + using Vec = vec256::Vec256; + int64_t d = 0; + for (; d < size - (size % Vec::size); d += Vec::size) { + Vec data_vec = Vec::loadu(input_data + d); + Vec data_vec2 = Vec::loadu(input_data2 + d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + d, size - d); + Vec data_vec2 = 
Vec::loadu(input_data2 + d, size - d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d, size - d); + } +} + +}} // namespace at::vec256 diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h new file mode 100644 index 0000000..442e8fd --- /dev/null +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -0,0 +1,28 @@ +#pragma once + +#if defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h new file mode 100644 index 0000000..98f1158 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -0,0 +1,35 @@ +#pragma once + +#include "intrinsics.h" + +#include "vec256_base.h" +#include "vec256_float.h" +#include "vec256_double.h" +#include "vec256_int.h" + +#include +#include +#include +#include +#include + +namespace at { +namespace vec256 { +namespace { + +template +std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { + T buf[Vec256::size]; + vec.store(buf); + stream << "vec["; + for (int i = 0; i != Vec256::size; i++) { + if (i != 0) { + stream << ", "; + } + stream << buf[i]; + } + stream << "]"; + return stream; +} + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h new file mode 100644 index 0000000..a2ca760 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include + +#include "ATen/Utils.h" + +#if defined(__GNUC__) +#define __at_align32__ __attribute__((aligned(32))) +#elif defined(_WIN32) +#define __at_align32__ __declspec(align(32)) +#else +#define __at_align32__ +#endif + +namespace at { +namespace vec256 { +namespace { + +// NOTE: If you specialize on a type, you must define all operations! 
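+//
+// Rough usage sketch (illustrative only, not taken from this patch). The
+// generic Vec256<T> below and the AVX specializations in vec256_float.h /
+// vec256_double.h expose the same interface:
+//
+//   float in[Vec256<float>::size], out[Vec256<float>::size];
+//   auto v = Vec256<float>::loadu(in);  // load Vec256<float>::size floats
+//   v = v + Vec256<float>(1.0f);        // broadcast constructor + operator+
+//   (v * v).store(out);                 // element-wise multiply, then store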
+ +// emulates vectorized types +template +struct Vec256 { +private: + T values[32 / sizeof(T)] = {0}; +public: + static constexpr int size = 32 / sizeof(T); + Vec256() {} + Vec256(T val) { + for (int i = 0; i != size; i++) { + values[i] = val; + } + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + int64_t mask = mask_; + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + if (mask & 0x01) { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + mask = mask >> 1; + } + return vec; + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + if (i < count) { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 loadu(const void* ptr) { + Vec256 vec; + std::memcpy(vec.values, ptr, 32); + return vec; + } + static Vec256 loadu(const void* ptr, int64_t count) { + Vec256 vec; + std::memcpy(vec.values, ptr, count * sizeof(T)); + return vec; + } + void store(void* ptr, int count = size) const { + std::memcpy(ptr, values, count * sizeof(T)); + } + const T& operator[](int idx) const { + return values[idx]; + } + T& operator[](int idx) { + return values[idx]; + } + Vec256 map(T (*f)(T)) const { + Vec256 ret; + for (int64_t i = 0; i != size; i++) { + ret[i] = f(values[i]); + } + return ret; + } + Vec256 abs() const { + Vec256 ret; + for (int64_t i = 0; i < size; i++) { + ret[i] = values[i] < 0 ? -values[i] : values[i]; + } + return ret; + } + Vec256 acos() const { + return map(std::acos); + } + Vec256 asin() const { + return map(std::asin); + } + Vec256 atan() const { + return map(std::atan); + } + Vec256 erf() const { + return map(std::erf); + } + Vec256 erfc() const { + return map(std::erfc); + } + Vec256 exp() const { + return map(std::exp); + } + Vec256 expm1() const { + return map(std::expm1); + } + Vec256 log() const { + return map(std::log); + } + Vec256 log10() const { + return map(std::log10); + } + Vec256 log1p() const { + return map(std::log1p); + } + Vec256 log2() const { + return map(std::log2); + } + Vec256 ceil() const { + return map(std::ceil); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + Vec256 floor() const { + return map(std::floor); + } + Vec256 neg() const { + return map([](T x) { return -x; }); + } + Vec256 round() const { + return map(std::round); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return map(std::tanh); + } + Vec256 trunc() const { + return map(std::trunc); + } + Vec256 sqrt() const { + return map(std::sqrt); + } + Vec256 reciprocal() const { + return map([](T x) { return (T)(1) / x; }); + } + Vec256 rsqrt() const { + return map([](T x) { return 1 / std::sqrt(x); }); + } +}; + +template Vec256 operator+(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] + b[i]; + } + return c; +} + +template Vec256 operator-(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] - b[i]; + } + return c; +} + +template Vec256 operator*(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] * b[i]; + } + return c; +} + +template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = a[i] / 
b[i]; + } + return c; +} + +template Vec256 max(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = std::max(a[i], b[i]); + } + return c; +} + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h new file mode 100644 index 0000000..975948a --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -0,0 +1,183 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" +#if defined(__AVX__) && !defined(_MSC_VER) +#include +#endif + +namespace at { +namespace vec256 { +namespace { + +#if defined(__AVX__) && !defined(_MSC_VER) + +template <> class Vec256 { +private: + __m256d values; +public: + static constexpr int size = 4; + Vec256() {} + Vec256(__m256d v) : values(v) {} + Vec256(double val) { + values = _mm256_set1_pd(val); + } + operator __m256d() const { + return values; + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_pd(a.values, b.values, mask); + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr, int64_t count = size) { + if (count == size) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align32__ double tmp_values[size]; + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(double)); + return _mm256_load_pd(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_pd(reinterpret_cast(ptr), values); + } else { + double tmp_values[size]; + _mm256_storeu_pd(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(double)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + Vec256 map(double (*f)(double)) const { + __at_align32__ double tmp[4]; + store(tmp); + for (int64_t i = 0; i < 4; i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vec256 abs() const { + auto mask = _mm256_set1_pd(-0.f); + return _mm256_andnot_pd(mask, values); + } + Vec256 acos() const { + return Vec256(Sleef_acosd4_u10(values)); + } + Vec256 asin() const { + return Vec256(Sleef_asind4_u10(values)); + } + Vec256 atan() const { + return Vec256(Sleef_atand4_u10(values)); + } + Vec256 erf() const { + return Vec256(Sleef_erfd4_u10(values)); + } + Vec256 erfc() const { + return Vec256(Sleef_erfcd4_u15(values)); + } + Vec256 exp() const { + return Vec256(Sleef_expd4_u10(values)); + } + Vec256 expm1() const { + return Vec256(Sleef_expm1d4_u10(values)); + } + Vec256 log() const { + return Vec256(Sleef_logd4_u10(values)); + } + Vec256 log2() const { + return Vec256(Sleef_log2d4_u10(values)); + } + Vec256 log10() const { + return Vec256(Sleef_log10d4_u10(values)); + } + Vec256 log1p() const { + return Vec256(Sleef_log1pd4_u10(values)); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cos); + } + Vec256 ceil() const { + return _mm256_ceil_pd(values); + } + Vec256 floor() const { + return _mm256_floor_pd(values); + } + Vec256 neg() const { + return _mm256_xor_pd(_mm256_set1_pd(-0.), values); + } + Vec256 round() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vec256 tan() const { + return 
map(std::tan); + } + Vec256 tanh() const { + return Vec256(Sleef_tanhd4_u10(values)); + } + Vec256 trunc() const { + return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vec256 sqrt() const { + return _mm256_sqrt_pd(values); + } + Vec256 reciprocal() const { + return _mm256_div_pd(_mm256_set1_pd(1), values); + } + Vec256 rsqrt() const { + return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mul_pd(a, b); +} + +template <> +Vec256 inline operator/(const Vec256& a, const Vec256& b) { + return _mm256_div_pd(a, b); +} + +template <> +Vec256 inline max(const Vec256& a, const Vec256& b) { + return _mm256_max_pd(a, b); +} + +#endif + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h new file mode 100644 index 0000000..09db2f4 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -0,0 +1,188 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" +#if defined(__AVX__) && !defined(_MSC_VER) +#include +#endif + +namespace at { +namespace vec256 { +namespace { + +#if defined(__AVX__) && !defined(_MSC_VER) + +template <> class Vec256 { +private: + __m256 values; +public: + static constexpr int64_t size = 8; + Vec256() {} + Vec256(__m256 v) : values(v) {} + Vec256(float val) { + values = _mm256_set1_ps(val); + } + operator __m256() const { + return values; + } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_ps(a.values, b.values, mask); + } + static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr, int64_t count = size) { + if (count == size) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + __at_align32__ float tmp_values[size]; + std::memcpy( + tmp_values, reinterpret_cast(ptr), count * sizeof(float)); + return _mm256_loadu_ps(tmp_values); + } + void store(void* ptr, int64_t count = size) const { + if (count == size) { + _mm256_storeu_ps(reinterpret_cast(ptr), values); + } else { + float tmp_values[size]; + _mm256_storeu_ps(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float)); + } + } + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + Vec256 map(float (*f)(float)) const { + __at_align32__ float tmp[8]; + store(tmp); + for (int64_t i = 0; i < 8; i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vec256 abs() const { + auto mask = _mm256_set1_ps(-0.f); + return _mm256_andnot_ps(mask, values); + } + Vec256 acos() const { + return Vec256(Sleef_acosf8_u10(values)); + } + Vec256 asin() const { + return Vec256(Sleef_asinf8_u10(values)); + } + Vec256 atan() const { + return Vec256(Sleef_atanf8_u10(values)); + } + Vec256 erf() const { + return Vec256(Sleef_erff8_u10(values)); + } + Vec256 erfc() const { + return Vec256(Sleef_erfcf8_u15(values)); + } + Vec256 exp() const { + return Vec256(Sleef_expf8_u10(values)); + } + 
Vec256 expm1() const { + return Vec256(Sleef_expm1f8_u10(values)); + } + Vec256 log() const { + return Vec256(Sleef_logf8_u10(values)); + } + Vec256 log2() const { + return Vec256(Sleef_log2f8_u10(values)); + } + Vec256 log10() const { + return Vec256(Sleef_log10f8_u10(values)); + } + Vec256 log1p() const { + return Vec256(Sleef_log1pf8_u10(values)); + } + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + Vec256 ceil() const { + return _mm256_ceil_ps(values); + } + Vec256 floor() const { + return _mm256_floor_ps(values); + } + Vec256 neg() const { + return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); + } + Vec256 round() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return Vec256(Sleef_tanhf8_u10(values)); + } + Vec256 trunc() const { + return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + } + Vec256 sqrt() const { + return _mm256_sqrt_ps(values); + } + Vec256 reciprocal() const { + return _mm256_div_ps(_mm256_set1_ps(1), values); + } + Vec256 rsqrt() const { + return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mul_ps(a, b); +} + +template <> +Vec256 inline operator/(const Vec256& a, const Vec256& b) { + return _mm256_div_ps(a, b); +} + +template <> +Vec256 inline max(const Vec256& a, const Vec256& b) { + return _mm256_max_ps(a, b); +} + +#endif + +}}} diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h new file mode 100644 index 0000000..19a0a93 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -0,0 +1,296 @@ +#pragma once + +#include "intrinsics.h" +#include "vec256_base.h" + +namespace at { +namespace vec256 { +namespace { + +#ifdef __AVX2__ + +struct Vec256i { +protected: + __m256i values; +public: + Vec256i() {} + Vec256i(__m256i v) : values(v) {} + operator __m256i() const { + return values; + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 4; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + __at_align32__ int64_t tmp_values[size]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi64(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi64(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi64(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi64(b.values, 3); + return loadu(tmp_values); + } + static Vec256 + set(Vec256 a, Vec256 b, int64_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int64_t count) { + __at_align32__ int64_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); + return loadu(tmp_values); 
+ } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int64_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vec256 abs() const { + auto zero = _mm256_set1_epi64x(0); + auto is_larger = _mm256_cmpgt_epi64(zero, values); + auto inverse = _mm256_xor_si256(values, is_larger); + return _mm256_sub_epi64(inverse, is_larger); + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 8; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + return _mm256_blend_epi32(a, b, mask); + } + static Vec256 + set(Vec256 a, Vec256 b, int32_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int32_t count) { + __at_align32__ int32_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int32_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + Vec256 abs() const { + return _mm256_abs_epi32(values); + } +}; + +template <> +struct Vec256 : public Vec256i { + static constexpr int size = 16; + using Vec256i::Vec256i; + Vec256() {} + Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + template + static Vec256 blend(Vec256 a, Vec256 b) { + __at_align32__ int16_t tmp_values[size]; + a.store(tmp_values); + if (mask & 0x01) + tmp_values[0] = _mm256_extract_epi16(b.values, 0); + if (mask & 0x02) + tmp_values[1] = _mm256_extract_epi16(b.values, 1); + if (mask & 0x04) + tmp_values[2] = _mm256_extract_epi16(b.values, 2); + if (mask & 0x08) + tmp_values[3] = _mm256_extract_epi16(b.values, 3); + if (mask & 0x10) + tmp_values[4] = _mm256_extract_epi16(b.values, 4); + if (mask & 0x20) + tmp_values[5] = _mm256_extract_epi16(b.values, 5); + if (mask & 0x40) + tmp_values[6] = _mm256_extract_epi16(b.values, 6); + if (mask & 0x80) + tmp_values[7] = _mm256_extract_epi16(b.values, 7); + if (mask & 0x100) + tmp_values[8] = _mm256_extract_epi16(b.values, 8); + if (mask & 0x200) + tmp_values[9] = _mm256_extract_epi16(b.values, 9); + if (mask & 0x400) + tmp_values[10] = _mm256_extract_epi16(b.values, 10); + if (mask & 0x800) + tmp_values[11] = _mm256_extract_epi16(b.values, 11); + if (mask & 0x1000) + tmp_values[12] = _mm256_extract_epi16(b.values, 12); + if (mask & 0x2000) + tmp_values[13] = _mm256_extract_epi16(b.values, 13); + if (mask & 0x4000) + tmp_values[14] = _mm256_extract_epi16(b.values, 14); + if (mask & 0x8000) + tmp_values[15] = _mm256_extract_epi16(b.values, 15); + 
return loadu(tmp_values); + } + static Vec256 + set(Vec256 a, Vec256 b, int16_t count = size) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: + return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vec256 loadu(const void* ptr) { + return _mm256_loadu_si256(reinterpret_cast(ptr)); + } + static Vec256 loadu(const void* ptr, int16_t count) { + __at_align32__ int16_t tmp_values[size]; + std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); + return loadu(tmp_values); + } + void store(void* ptr, int count = size) const { + if (count == size) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); + } else { + __at_align32__ int16_t tmp_values[size]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + Vec256 abs() const { + return _mm256_abs_epi16(values); + } +}; + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi64(a, b); +} + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi32(a, b); +} + +template <> +Vec256 inline operator+(const Vec256& a, const Vec256& b) { + return _mm256_add_epi16(a, b); +} + +// AVX2 has no intrinsic for int64_t multiply so it needs to be emulated +// This could be implemented more efficiently using epi32 instructions +// This is also technically avx compatible, but then we'll need AVX +// code for add as well. +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + int64_t a0 = _mm256_extract_epi64(a, 0); + int64_t a1 = _mm256_extract_epi64(a, 1); + int64_t a2 = _mm256_extract_epi64(a, 2); + int64_t a3 = _mm256_extract_epi64(a, 3); + + int64_t b0 = _mm256_extract_epi64(b, 0); + int64_t b1 = _mm256_extract_epi64(b, 1); + int64_t b2 = _mm256_extract_epi64(b, 2); + int64_t b3 = _mm256_extract_epi64(b, 3); + + int64_t c0 = a0 * b0; + int64_t c1 = a1 * b1; + int64_t c2 = a2 * b2; + int64_t c3 = a3 * b3; + + return _mm256_set_epi64x(c3, c2, c1, c0); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mullo_epi32(a, b); +} + +template <> +Vec256 inline operator*(const Vec256& a, const Vec256& b) { + return _mm256_mullo_epi16(a, b); +} +#endif + +}}} diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h new file mode 100644 index 0000000..72877bc --- /dev/null +++ b/aten/src/ATen/cpu/vml.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/Config.h" +#include "ATen/Parallel.h" +#include "ATen/cpu/vec256/functional.h" +#include "ATen/cpu/vec256/vec256.h" + +// This header implements various unary operations using a MKL VML style +// interface. + +// It implements various functions with a simple interface +// For example it enables the user to call vsin(float* out, const float* in, +// size) This functions takes a pointer to a contious output array of floats and +// a constant input array. 
It will then apply sin to each value in in the input +// array and write the result into the output array. out and in may point to the +// same memory, i.e. this fully supports in-place operations. These functions +// also implement their own parallelization, so take precautions when calling +// these from threaded functions. + +// When MKL is available it will call into MKL's VML library similar to NumPy +// If MKL is not available it will use SLEEF. + +// This file might be compiled under AVX or AVX2 when called from e.g. +// UnaryOpsKernel.cpp + +#include +#include +#include +#include +#include + +#if AT_MKL_ENABLED() && !defined(__APPLE__) +#include +#endif + +// [Note SSE-AVX transitions] +// There is a bug in Glibc2.23 +// https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280. Calling zeroall +// when using AVX/AVX2 code resolves this. +#if defined(__AVX__) && defined(__GLIBC__) && __GLIBC_MINOR__ == 23 +#define DL_RUNTIME_BUG(op, type) \ + volatile type x = (type)(1); \ + x = std::op(x); \ + _mm256_zeroall(); +#else +#define DL_RUNTIME_BUG(op, type) +#endif + +namespace at { +namespace vml { +namespace { + +using namespace vec256; + +template +inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) { + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map( + [](const Vec256& x) { + return Vec256((scalar_t)(1)) / x.sqrt(); + }, + out + begin, + in + begin, + end - begin); + }); +} + +// NB: We ignore numerical errors by convention and leave them to the user + +// We unfortunately need to duplicate code here to deal with the SSE-AVX +// transition bug (see [Note SSE-AVX transitions]). As soon as we can expect +// users to use a version of glibc newer than 2.23 we will be able to ditch +// this. This duplication is also necessary since not all functions (e.g. rsqrt) +// might be part of cmath. 
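+//
+// Rough usage sketch (illustrative only, not taken from this patch): the
+// IMPLEMENT_VML* macros below generate functions such as vsin/vexp, e.g.
+//
+//   std::vector<float> in(n), out(n);
+//   at::vml::vsin(out.data(), in.data(), n);  // out[i] = sin(in[i])
+//   at::vml::vexp(in.data(), in.data(), n);   // in-place use is supported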
+ +#define IMPLEMENT_VML_BUG(op) \ + template \ + inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) { \ + DL_RUNTIME_BUG(op, scalar_t) \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + begin, \ + end - begin); \ + }); \ + } + +#define IMPLEMENT_VML(op) \ + template \ + inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) { \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + begin, \ + end - begin); \ + }); \ + } + +IMPLEMENT_VML_BUG(abs) +IMPLEMENT_VML_BUG(acos) +IMPLEMENT_VML_BUG(asin) +IMPLEMENT_VML_BUG(atan) +IMPLEMENT_VML_BUG(ceil) +IMPLEMENT_VML_BUG(cos) +// IMPLEMENT_VML_BUG(cosh) +IMPLEMENT_VML_BUG(erf) +IMPLEMENT_VML_BUG(erfc) +IMPLEMENT_VML_BUG(exp) +IMPLEMENT_VML_BUG(expm1) +IMPLEMENT_VML_BUG(floor) +IMPLEMENT_VML(reciprocal) +IMPLEMENT_VML_BUG(log) +IMPLEMENT_VML_BUG(log10) +IMPLEMENT_VML_BUG(log1p) +IMPLEMENT_VML_BUG(log2) +IMPLEMENT_VML(neg) +IMPLEMENT_VML_BUG(sin) +// IMPLEMENT_VML_BUG(sinh) +IMPLEMENT_VML_BUG(sqrt) +IMPLEMENT_VML_BUG(round) +IMPLEMENT_VML(rsqrt) +IMPLEMENT_VML_BUG(tan) +IMPLEMENT_VML_BUG(tanh) +IMPLEMENT_VML_BUG(trunc) + +#if AT_MKL_ENABLED() && !defined(__APPLE__) + +#define IMPLEMENT_VML_MKL(op, mklop) \ + template <> \ + inline void v##op(float* out, const float* in, int64_t size) { \ + vms##mklop(size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } \ + template <> \ + inline void v##op(double* out, const double* in, int64_t size) { \ + vmd##mklop(size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } + +// NB: abs, cosh and sinh were temporarily disabled due to issues with Apple clang + +IMPLEMENT_VML_MKL(abs, Abs) +IMPLEMENT_VML_MKL(acos, Acos) +IMPLEMENT_VML_MKL(asin, Asin) +IMPLEMENT_VML_MKL(atan, Atan) +IMPLEMENT_VML_MKL(cos, Cos) +// IMPLEMENT_VML_MKL(cosh, Cosh) +IMPLEMENT_VML_MKL(erf, Erf) +IMPLEMENT_VML_MKL(erfc, Erfc) +IMPLEMENT_VML_MKL(exp, Exp) +IMPLEMENT_VML_MKL(expm1, Expm1) +IMPLEMENT_VML_MKL(log, Ln) +IMPLEMENT_VML_MKL(log10, Log10) +IMPLEMENT_VML_MKL(log1p, Log1p) +IMPLEMENT_VML_MKL(sin, Sin) +// IMPLEMENT_VML_MKL(sinh, Sinh) +IMPLEMENT_VML_MKL(sqrt, Sqrt) +IMPLEMENT_VML_MKL(tan, Tan) +IMPLEMENT_VML_MKL(tanh, Tanh) +IMPLEMENT_VML_MKL(trunc, Trunc) + +#if INTEL_MKL_VERSION >= 20180406 +IMPLEMENT_VML_MKL(log2, Log2) +#endif + +#endif + +} // namespace +} // namespace vml +} // namespace at diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h new file mode 100644 index 0000000..4dade5e --- /dev/null +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -0,0 +1,11 @@ +#pragma once + +#ifdef _WIN32 +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# define AT_CUDA_API __declspec(dllexport) +# else +# define AT_CUDA_API __declspec(dllimport) +# endif +#else +# define AT_CUDA_API +#endif diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh new file mode 100644 index 0000000..e34cd14 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -0,0 +1,908 @@ +#pragma once + +#include "detail/IndexUtils.cuh" +#include "ATen/TensorUtils.h" +#include "THC/THCAtomics.cuh" + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. 
+// + +namespace at { +namespace cuda { + +// TODO: combine with TensorArg? So far that's been for debugging, and this is functional... +enum class TensorArgType { ReadWrite, ReadOnly }; + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. +// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=4) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). +template +void rearrangeDims(detail::TensorInfo* aInfo, + detail::TensorInfo* bInfo = nullptr, + detail::TensorInfo* cInfo = nullptr, + detail::TensorInfo* dInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[4] = { aInfo->sizes, }; + IndexType *strides[4] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + if (dInfo != nullptr) { + ++numInfos; + if (dInfo->dims != dims) return; + sizes[3] = dInfo->sizes; + strides[3] = dInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. 
+ bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// Threads per block for our apply kernel +// FIXME: use occupancy calculator instead +#define AT_APPLY_THREADS_PER_BLOCK 32 * 16 +#define AT_APPLY_BLOCKS_PER_SM 4 + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply2(detail::TensorInfo a, + detail::TensorInfo b, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + op(a.data[aOffset], b.data[bOffset]); + } +} + + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply3(detail::TensorInfo a, + detail::TensorInfo b, + detail::TensorInfo c, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = + detail::IndexToOffset::get(linearIndex, c); + + op(a.data[aOffset], b.data[bOffset], c.data[cOffset]); + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernelPointwiseApply4(detail::TensorInfo a, + detail::TensorInfo b, + detail::TensorInfo c, + detail::TensorInfo d, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = + detail::IndexToOffset::get(linearIndex, a); + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = + detail::IndexToOffset::get(linearIndex, c); + + // Convert `linearIndex` into an offset of `d` + const IndexType dOffset = + detail::IndexToOffset::get(linearIndex, d); + + op(a.data[aOffset], b.data[bOffset], c.data[cOffset], d.data[dOffset]); + } +} + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { 
+ if (curDevice == -1) return false; + uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); + uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; + if (numBlocks > maxGridX) + numBlocks = maxGridX; + grid = dim3(numBlocks); + return true; +} + +inline dim3 getApplyBlock() { + return dim3(AT_APPLY_THREADS_PER_BLOCK); +} + +/* + Apply a pointwise operator to two tensors. + + The calling convention for op is a function/functor that takes takes two references to + type scalar; at least one of these references should be non-const in order to write the output. + For example, to compute a = b^2, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }; +*/ +template +bool CUDA_tensor_apply2(at::Tensor a, + at::Tensor b, + Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply2", {a, b}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + Tensor oldA; + Tensor oldB; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
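+  //
+  // A small worked example of that collapsing (illustrative only): copying a
+  // transposed 2x3 source into a contiguous 2x3 destination gives
+  //   dst: sizes [2, 3], strides [3, 1]  -> collapses to a single dim of size 6
+  //   src: sizes [2, 3], strides [1, 2]  -> cannot collapse, stays 2-dimensional
+  // so only the (1-D dst, 2-D src) specialization is launched below.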
+ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + aInfo, bInfo, (TYPE) totalElements, op); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + kernelPointwiseApply2 + <<>>( + aInfo, bInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + kernelPointwiseApply2 + <<>>( + aInfo, bInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + return true; +} + +/* + Apply a pointwise operator to three tensors. + + The calling convention for op is a function/functor that takes takes three references to + type scalar; at least one of these references should be non-const in order to write the output. 
+ For example, to compute a = b + c, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val) { + a_val = b_val + c_val; + }; +*/ +template +bool CUDA_tensor_apply3(at::Tensor a, + at::Tensor b, + at::Tensor c, + const Op& op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply3", {a, b, c}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel() || + totalElements != c.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS || + c.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + Tensor oldA; + Tensor oldB; + Tensor oldC; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + if (cType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldC = c; + c = c.contiguous(); + } + +#define HANDLE_CASE(TYPE, A, B, C) \ + kernelPointwiseApply3 \ + <<>>( \ + aInfo, bInfo, cInfo, (TYPE) totalElements, op); + +#define HANDLE_C_CASE(TYPE, A, B, C) { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b) && + detail::canUse32BitIndexMath(c)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + 
bInfo.collapseDims(); + cInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { + kernelPointwiseApply3 + <<>>( + aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + kernelPointwiseApply3 + <<>>( + aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + if (oldC.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldC._copy_ignoring_overlaps_(c); + c = oldC; + } + + return true; +} + +/* + Apply a pointwise operator to four tensors. + + The calling convention for op is a function/functor that takes takes four references to + type scalar; at least one of these references should be non-const in order to write the output. + For example, to compute a = b + c * d, op would be of the form: + [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val, const scalar &d_val) { + a_val = b_val + c_val * d_val; + }; +*/ +template +bool CUDA_tensor_apply4(at::Tensor a, + at::Tensor b, + at::Tensor c, + at::Tensor d, + const Op& op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly, + TensorArgType dType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_apply4", {a, b, c, d}, Backend::CUDA); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel() || + totalElements != c.numel() || + totalElements != d.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS || + c.dim() > MAX_TENSORINFO_DIMS || + d.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. 
+ */ + Tensor oldA; + Tensor oldB; + Tensor oldC; + Tensor oldD; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = b; + b = b.contiguous(); + } + if (cType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldC = c; + c = c.contiguous(); + } + if (dType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(c)) { + // Must perform in contiguous space + oldD = d; + d = d.contiguous(); + } + +#define HANDLE_CASE(TYPE, A, B, C, D) \ + kernelPointwiseApply4 \ + <<>>( \ + aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); + +#define HANDLE_D_CASE(TYPE, A, B, C, D) { \ + switch (D) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, C, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, C, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, C, -1); \ + break; \ + } \ +} + +#define HANDLE_C_CASE(TYPE, A, B, C, D) { \ + switch (C) { \ + case 1: \ + HANDLE_D_CASE(TYPE, A, B, 1, D); \ + break; \ + case 2: \ + HANDLE_D_CASE(TYPE, A, B, 2, D); \ + break; \ + default: \ + HANDLE_D_CASE(TYPE, A, B, -1, D); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C, D) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C, D); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C, D); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C, D); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C, D) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C, D); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C, D); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C, D); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b) && + detail::canUse32BitIndexMath(c) && + detail::canUse32BitIndexMath(d)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + detail::TensorInfo dInfo = + detail::getTensorInfo(d); + + rearrangeDims(&aInfo, &bInfo, &cInfo, &dInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + dInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + + detail::TensorInfo cInfo = + detail::getTensorInfo(c); + + detail::TensorInfo dInfo = + detail::getTensorInfo(d); + + rearrangeDims(&aInfo, &bInfo, &cInfo, &dInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + dInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. 
+ */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1 && dInfo.dims == 1) { + kernelPointwiseApply4 + <<>>( + aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + kernelPointwiseApply4 + <<>>( + aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_D_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + oldA._copy_ignoring_overlaps_(a); + a = oldA; + } + + if (oldB.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + oldB._copy_ignoring_overlaps_(b); + b = oldB; + } + + if (oldC.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldC._copy_ignoring_overlaps_(c); + c = oldC; + } + + if (oldD.defined()) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + oldD._copy_ignoring_overlaps_(c); + d = oldD; + } + + return true; +} + +} // cuda +} // at diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in new file mode 100644 index 0000000..72adee5 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -0,0 +1,7 @@ +#pragma once + +// Test these using #if AT_CUDNN_ENABLED(), not #ifdef, so that it's +// obvious if you forgot to include Config.h +// c.f. https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined + +#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ diff --git a/aten/src/ATen/cuda/CUDAGenerator.cpp b/aten/src/ATen/cuda/CUDAGenerator.cpp new file mode 100644 index 0000000..38fcd84 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGenerator.cpp @@ -0,0 +1,56 @@ +#include "ATen/Config.h" + +#include "ATen/CUDAGenerator.h" +#include "ATen/Context.h" +#include "THCTensorRandom.h" +#include + +// There is only one CUDAGenerator instance. Calls to seed(), manualSeed(), +// initialSeed(), and unsafeGetTH() refer to the THCGenerator on the current +// device. 
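A short sketch of the per-device behaviour described above (illustrative only; `gen` is assumed to be the single CUDAGenerator obtained from the ATen Context, and the raw cudaSetDevice calls stand in for whatever device-switching mechanism the caller actually uses):

```
cudaSetDevice(0);
gen.manualSeed(123);             // seeds only device 0's THCGenerator
cudaSetDevice(1);
uint64_t s = gen.initialSeed();  // queries device 1's generator, not device 0's
gen.manualSeedAll(123);          // seeds the generator on every device
```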
+ +THCGenerator* THCRandom_getGenerator(THCState* state); + +namespace at { + +CUDAGenerator::CUDAGenerator(Context * context_) + : context(context_) +{ +} + +CUDAGenerator::~CUDAGenerator() { + // no-op Generator state is global to the program +} + +CUDAGenerator& CUDAGenerator::copy(const Generator& from) { + throw std::runtime_error("CUDAGenerator::copy() not implemented"); +} + +CUDAGenerator& CUDAGenerator::free() { + THCRandom_shutdown(context->getTHCState()); + return *this; +} + +uint64_t CUDAGenerator::seed() { + return THCRandom_initialSeed(context->getTHCState()); +} + +uint64_t CUDAGenerator::initialSeed() { + return THCRandom_initialSeed(context->getTHCState()); +} + +CUDAGenerator& CUDAGenerator::manualSeed(uint64_t seed) { + THCRandom_manualSeed(context->getTHCState(), seed); + return *this; +} + +CUDAGenerator& CUDAGenerator::manualSeedAll(uint64_t seed) { + THCRandom_manualSeedAll(context->getTHCState(), seed); + return *this; +} + +void * CUDAGenerator::unsafeGetTH() { + return (void*)THCRandom_getGenerator(context->getTHCState()); +} + +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu new file mode 100644 index 0000000..c13efc6 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAHalf.cu @@ -0,0 +1,56 @@ +#include "ATen/cuda/CUDAHalf.cuh" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) +template <> AT_CUDA_API +half convert(Half aten_half) { + return half{aten_half.x}; +} + +template <> AT_CUDA_API +half convert(double value) { + return half{Half(value).x}; +} + +template <> AT_CUDA_API +Half convert(half cuda_half) { + return Half(cuda_half.x, Half::from_bits); +} +#else +template <> AT_CUDA_API +half convert(Half aten_half) { + __half_raw x_raw; + x_raw.x = aten_half.x; + return half(x_raw); +} + +template <> AT_CUDA_API +Half convert(half cuda_half) { + __half_raw raw(cuda_half); + return Half(raw.x, Half::from_bits); +} + +template <> AT_CUDA_API +half convert(double value) { + __half_raw raw; + raw.x = Half(value).x; + return half {raw}; +} + +template <> __half HalfFix(Half h) { + __half_raw raw; + raw.x = h.x; + return __half{raw}; +} + +template <> Half HalfFix(__half h) { + __half_raw raw(h); + return Half(raw.x, Half::from_bits); +} +#endif +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh new file mode 100644 index 0000000..87e7621 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAHalf.cuh @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/cuda/ATenCUDAGeneral.h" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +template <> AT_CUDA_API half convert(Half aten_half); +template <> AT_CUDA_API Half convert(half cuda_half); +template <> AT_CUDA_API half convert(double value); +#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) +template <> __half HalfFix(Half h); +template <> Half HalfFix(__half h); +#endif +} // namespace at diff --git a/aten/src/ATen/cuda/CUDATensorMethods.cuh b/aten/src/ATen/cuda/CUDATensorMethods.cuh new file mode 100644 index 0000000..39f81d9 --- /dev/null +++ b/aten/src/ATen/cuda/CUDATensorMethods.cuh @@ -0,0 +1,15 @@ +#pragma once + +#include "ATen/Tensor.h" +#include "ATen/Half.h" + +#include +#include +#include + +namespace at { +template <> +inline __half* Tensor::data() const { + return reinterpret_cast<__half*>(data()); +} +} // namespace at diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp new 
file mode 100644 index 0000000..3e6c683 --- /dev/null +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp @@ -0,0 +1,17 @@ +#include +#include +#include + +#include +#include + +#include + +namespace at { namespace cuda { + +at::Allocator* getPinnedMemoryAllocator() { + auto state = globalContext().lazyInitCUDA(); + return state->cudaHostAllocator; +} + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.h b/aten/src/ATen/cuda/PinnedMemoryAllocator.h new file mode 100644 index 0000000..f3aa457 --- /dev/null +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at { namespace cuda { + +at::Allocator* getPinnedMemoryAllocator(); + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp new file mode 100644 index 0000000..2969924 --- /dev/null +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -0,0 +1,238 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "THC/THC.h" +#include + +#if AT_CUDNN_ENABLED() +#include "ATen/cudnn/cudnn-wrapper.h" +#endif + +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace { + +void check_status(int32_t status) { + AT_CHECK( + static_cast(status) == cudaSuccess, + "CUDA error (", + static_cast(status), + "): ", + cudaGetErrorString(static_cast(status))); +} + +void set_device(int32_t device) { + check_status(cudaSetDevice(device)); +} + +void get_device(int32_t* device) { + check_status(cudaGetDevice(device)); +} + +void unchecked_set_device(int32_t device) { + const auto return_code = cudaSetDevice(device); + (void)return_code; +} + +void cuda_stream_create_with_priority( + cudaStream_t* pStream +, int32_t flags +, int32_t priority) { +#ifndef __HIP_PLATFORM_HCC__ + check_status(cudaStreamCreateWithPriority(pStream, flags, priority)); +#else + check_status(cudaStreamCreateWithFlags(pStream, flags)); +#endif +} + +void cuda_stream_destroy(cudaStream_t stream) { + check_status(cudaStreamDestroy(stream)); +} + +struct DynamicCUDAInterfaceSetter { + DynamicCUDAInterfaceSetter() { + at::detail::DynamicCUDAInterface::set_device = set_device; + at::detail::DynamicCUDAInterface::get_device = get_device; + at::detail::DynamicCUDAInterface::unchecked_set_device = + unchecked_set_device; + at::detail::DynamicCUDAInterface::cuda_stream_create_with_priority = + cuda_stream_create_with_priority; + at::detail::DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + } +}; + +// Single, global, static (because of the anonymous namespace) instance, whose +// constructor will set the static members of `DynamicCUDAInterface` to CUDA +// functions when the ATen CUDA library is loaded. +DynamicCUDAInterfaceSetter _; +} // namespace + +// NB: deleter is dynamic, because we need it to live in a separate +// compilation unit (alt is to have another method in hooks, but +// let's not if we don't need to!) 
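The DynamicCUDAInterfaceSetter above relies on a standard C++ idiom: a static object in an anonymous namespace whose constructor swaps stub function pointers for real implementations as a side effect of this translation unit being loaded. A minimal standalone sketch of that idiom (all names below are illustrative, not taken from ATen):

```
#include <cstdio>

namespace iface {
// Stub used until the "backend" translation unit is linked in.
void default_do_work() { std::puts("backend not loaded"); }
void (*do_work)() = default_do_work;
}  // namespace iface

namespace {
void real_do_work() { std::puts("backend loaded"); }

struct Setter {
  Setter() { iface::do_work = real_do_work; }
};
// Constructing this static instance rewires the pointer during static
// initialization, mirroring what DynamicCUDAInterfaceSetter does for CUDA.
Setter setter;
}  // namespace

int main() {
  iface::do_work();  // prints "backend loaded" once the setter TU is in the binary
}
```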
+std::unique_ptr CUDAHooks::initCUDA() const { + THCState* thc_state = THCState_alloc(); + // Caching allocator has no context + THCState_setDeviceAllocator(thc_state, THCCachingAllocator_get()); + thc_state->cudaHostAllocator = getTHCCachingHostAllocator(); + THCudaInit(thc_state); + return std::unique_ptr( + thc_state, [](THCState* p) { + if (p) + THCState_free(p); + }); +} + +std::unique_ptr CUDAHooks::initCUDAGenerator( + Context* context) const { + return std::unique_ptr(new CUDAGenerator(context)); +} + +bool CUDAHooks::hasCUDA() const { + int count; + cudaError_t err = cudaGetDeviceCount(&count); + if (err == cudaErrorInsufficientDriver) { + return false; + } + return true; +} + +bool CUDAHooks::hasCuDNN() const { + return AT_CUDNN_ENABLED(); +} + +#ifndef __HIP_PLATFORM_HCC__ +cusparseHandle_t CUDAHooks::getCurrentCUDASparseHandle(THCState* thc_state) const { + return THCState_getCurrentSparseHandle(thc_state); +} +#endif +struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties( + THCState* thc_state) const { + return THCState_getCurrentDeviceProperties(thc_state); +} +struct cudaDeviceProp* CUDAHooks::getDeviceProperties( + THCState* thc_state, + int device) const { + return THCState_getDeviceProperties(thc_state, device); +} + +int64_t CUDAHooks::current_device() const { + int device; + cudaError_t err = cudaGetDevice(&device); + if (err == cudaSuccess) { + return device; + } + return -1; +} + +Allocator* CUDAHooks::getPinnedMemoryAllocator() const { + return at::cuda::getPinnedMemoryAllocator(); +} + +void CUDAHooks::registerCUDATypes(Context* context) const { + register_cuda_types(context); +} + +bool CUDAHooks::compiledWithCuDNN() const { + return AT_CUDNN_ENABLED(); +} + +bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { +#if AT_CUDNN_ENABLED() + cudaDeviceProp* prop = + getCurrentDeviceProperties(globalContext().getTHCState()); + // NOTE: extra parenthesis around numbers disable clang warnings about + // dead code + return ( + (CUDNN_VERSION >= (6021)) || + (CUDNN_VERSION >= (6000) && prop->major >= 5)); +#else + return false; +#endif +} + +long CUDAHooks::versionCuDNN() const { +#if AT_CUDNN_ENABLED() + return CUDNN_VERSION; +#else + AT_ERROR("Cannot query CuDNN version if ATen_cuda is not built with CuDNN"); +#endif +} + +double CUDAHooks::batchnormMinEpsilonCuDNN() const { +#if AT_CUDNN_ENABLED() + return CUDNN_BN_MIN_EPSILON; +#else + AT_ERROR( + "Cannot query CUDNN_BN_MIN_EPSILON if ATen_cuda is not built with CuDNN"); +#endif +} + +int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize() const { +#ifndef __HIP_PLATFORM_HCC__ + return at::native::detail::cufft_get_plan_cache_max_size_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +void CUDAHooks::cuFFTSetPlanCacheMaxSize(int64_t max_size) const { +#ifndef __HIP_PLATFORM_HCC__ + at::native::detail::cufft_set_plan_cache_max_size_impl(max_size); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +int64_t CUDAHooks::cuFFTGetPlanCacheSize() const { +#ifndef __HIP_PLATFORM_HCC__ + return at::native::detail::cufft_get_plan_cache_size_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +void CUDAHooks::cuFFTClearPlanCache() const { +#ifndef __HIP_PLATFORM_HCC__ + at::native::detail::cufft_clear_plan_cache_impl(); +#else + AT_ERROR("cuFFT with HIP is not supported"); +#endif +} + +int CUDAHooks::getNumGPUs() const { + int count; + auto err = cudaGetDeviceCount(&count); + if (err == cudaErrorNoDevice) { + return 0; + } else if (err != cudaSuccess) { + 
AT_ERROR( + "CUDA error (", static_cast(err), "): ", cudaGetErrorString(err)); + } + return count; +} + +// Sigh, the registry doesn't support namespaces :( +using at::CUDAHooksRegistry; +using at::RegistererCUDAHooksRegistry; + +REGISTER_CUDA_HOOKS(CUDAHooks); + +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h new file mode 100644 index 0000000..d88ac0d --- /dev/null +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -0,0 +1,36 @@ +#include + +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at { namespace cuda { namespace detail { + +// The real implementation of CUDAHooksInterface +struct CUDAHooks : public at::CUDAHooksInterface { + CUDAHooks(at::CUDAHooksArgs) {} + std::unique_ptr initCUDA() const override; + std::unique_ptr initCUDAGenerator(Context*) const override; + bool hasCUDA() const override; + bool hasCuDNN() const override; +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const override; +#endif + struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; + struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; + int64_t current_device() const override; + Allocator* getPinnedMemoryAllocator() const override; + void registerCUDATypes(Context*) const override; + bool compiledWithCuDNN() const override; + bool supportsDilatedConvolutionWithCuDNN() const override; + long versionCuDNN() const override; + double batchnormMinEpsilonCuDNN() const override; + int64_t cuFFTGetPlanCacheMaxSize() const override; + void cuFFTSetPlanCacheMaxSize(int64_t max_size) const override; + int64_t cuFFTGetPlanCacheSize() const override; + void cuFFTClearPlanCache() const override; + int getNumGPUs() const override; +}; + +}}} // at::cuda::detail diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cu b/aten/src/ATen/cuda/detail/IndexUtils.cu new file mode 100644 index 0000000..43b2637 --- /dev/null +++ b/aten/src/ATen/cuda/detail/IndexUtils.cu @@ -0,0 +1,97 @@ +#include "IndexUtils.cuh" + +namespace at { +namespace cuda { +namespace detail { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ + int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +/* +Returns false if there is no possibility that the tensor +has "overlapping" indices and true otherwise. +"Overlapping" indices are two+ valid indices that specify +the same offset within the tensor. +The function does this by checking for a sufficient but not +necessary condition of no overlap. In particular, that +that there exists an ordering of the tensor's dimensions +that is nicely "nested," with each dimension contained +within the next one. +*/ +bool maybeOverlappingIndices(const Tensor& t) { + /* Extract size/stride arrays; only consider size >1 dims. 
*/ + SizeAndStride *info = (SizeAndStride *)alloca(sizeof(SizeAndStride) * t.dim()); + int dims = t.dim(); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = t.size(i); + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = t.stride(i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + // Short-circuits if tensor is a single element. + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} + +bool canUse32BitIndexMath(const Tensor& t, int64_t max_elem) { + int64_t elements = t.numel(); + if (elements >= max_elem) { + return false; + } + + int64_t offset = 0; + int64_t linearId = elements - 1; + + for (int i = t.dim() - 1; i >= 0; --i) { + int64_t curDimIndex = linearId % t.size(i); + int64_t curDimOffset = curDimIndex * t.stride(i); + offset += curDimOffset; + linearId /= t.size(i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh new file mode 100644 index 0000000..9bbf8f7 --- /dev/null +++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh @@ -0,0 +1,32 @@ +#pragma once + +#include "ATen/ATen.h" +#include "TensorInfo.cuh" +#include + +namespace at { +namespace cuda { +namespace detail { + +bool maybeOverlappingIndices(const at::Tensor& t); +bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits::max()); + +template +TensorInfo +getTensorInfo(const at::Tensor& t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + return TensorInfo( + t.data(), dims, sz, st); +} + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cuda/detail/TensorInfo.cuh b/aten/src/ATen/cuda/detail/TensorInfo.cuh new file mode 100644 index 0000000..e0ada29 --- /dev/null +++ b/aten/src/ATen/cuda/detail/TensorInfo.cuh @@ -0,0 +1,186 @@ +#pragma once + +#include "ATen/ATen.h" + +namespace at { +namespace cuda { +namespace detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + /* + Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of + the info, possibly excluding the optional excludeDim. A "collapsed" version + of the info is the fewest dims that order the tensor's elements in the same + way as the original info. If excludeDim is specified, the collapse is the + fewest dims that order the tensor's elements as the original and preserve the + excluded dimension, unless the tensor collapses to a point. + + Returns the (new) index of the preserved dimension if excludeDim is + specified. Returns 0 if the tensor is collapsed to a point. Returns -1 + otherwise. 
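A worked example may help (illustrative, not from the original comment): a TensorInfo with sizes [2, 1, 3] and strides [3, 3, 1] orders its elements exactly like a contiguous 1-D tensor, so collapseDims() rewrites it to sizes [6], strides [1] and returns -1. Calling collapseDims(0) instead preserves the first dimension, producing sizes [2, 3], strides [3, 1] and returning 0, the new index of the preserved dimension.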
+ */ + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + AT_ASSERT(dims < MAX_TENSORINFO_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + AT_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + + AT_CHECK(excludeDim >= -1 && excludeDim < dims, + "expected excluded dim between -1 and dims - 1"); + + int stopDim = (excludeDim == -1) ? dims : excludeDim; + int newIndex = -1; + int oldIndex = 0; + int remappedExcludedDim = -1; + + while (oldIndex < dims) { + // Finds a dimension to collapse into + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + ++oldIndex; + break; + } + + // Collapses dims + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { + sizes[newIndex] *= sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } else { + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } + } + + // Handles excludeDim being set (oldIndex == excludeDim) + if (oldIndex != dims) { + + // Preserves excluded dimension + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + remappedExcludedDim = newIndex; + + // Restarts iteration after excludeDim + ++oldIndex; + stopDim = dims; + } + } + + // Handles special case of all dims size 1 + if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + return 0; + } + + dims = newIndex + 1; + return remappedExcludedDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compiletime) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // detail +} // cuda +} // at diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp new file mode 100644 index 0000000..aafaebf --- /dev/null +++ 
b/aten/src/ATen/cudnn/Descriptors.cpp @@ -0,0 +1,135 @@ +#include "Descriptors.h" + +#include + +#include +#include +#include + +namespace at { namespace native { + +namespace { + +inline cudnnDataType_t getDataType(const at::Type& t) { + auto scalar_type = t.scalarType(); + if (scalar_type == at::kFloat) { + return CUDNN_DATA_FLOAT; + } else if (scalar_type == at::kHalf) { + return CUDNN_DATA_HALF; + } else if (scalar_type == at::kDouble) { + return CUDNN_DATA_DOUBLE; + } + throw std::runtime_error("TensorDescriptor only supports double, float and half tensors"); +} + +inline cudnnDataType_t getDataType(const at::Tensor& t) { + return getDataType(t.type()); +} + +} // anonymous namespace + + +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad); +} + +void TensorDescriptor::set(cudnnDataType_t datatype, IntList t_sizes, IntList t_strides, size_t pad) { + size_t dim = t_sizes.size(); + if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + int size[CUDNN_DIM_MAX]; + int stride[CUDNN_DIM_MAX]; + for (size_t i = 0; i < dim; ++i) { + size[i] = static_cast(t_sizes[i]); + stride[i] = static_cast(t_strides[i]); + } + for (size_t i = dim; i < pad; ++i) { + size[i] = 1; + stride[i] = 1; + } + set(datatype, static_cast(std::max(dim, pad)), size, stride); +} + +std::string cudnnTypeToString(cudnnDataType_t dtype) { + switch (dtype) { + case CUDNN_DATA_FLOAT: + return "CUDNN_DATA_FLOAT"; + case CUDNN_DATA_DOUBLE: + return "CUDNN_DATA_DOUBLE"; + case CUDNN_DATA_HALF: + return "CUDNN_DATA_HALF"; + case CUDNN_DATA_INT8: + return "CUDNN_DATA_INT8"; + case CUDNN_DATA_INT32: + return "CUDNN_DATA_INT32"; + case CUDNN_DATA_INT8x4: + return "CUDNN_DATA_INT8x4"; +#if CUDNN_VERSION >= 7100 + case CUDNN_DATA_UINT8: + return "CUDNN_DATA_UINT8"; + case CUDNN_DATA_UINT8x4: + return "CUDNN_DATA_UINT8x4"; +#endif + default: + std::ostringstream oss; + oss << "(unknown data-type " << static_cast(dtype) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims; + int dimA[CUDNN_DIM_MAX]; + int strideA[CUDNN_DIM_MAX]; + cudnnDataType_t dtype; + cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! + out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + out << " strideA = "; + for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void TensorDescriptor::print() { std::cout << *this; } + +void FilterDescriptor::set(const at::Tensor &t, int64_t pad) { + auto dim = t.ndimension(); + if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + if (!t.is_contiguous()) { + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. 
+ throw std::runtime_error("cuDNN filters (a.k.a. weights) must be contiguous"); + } + int size[CUDNN_DIM_MAX]; + for (int i = 0; i < dim; ++i) { + size[i] = (int) t.size(i); + } + for (int i = dim; i < pad; ++i) { + size[i] = (int) 1; + } + dim = std::max(dim, pad); + set(getDataType(t), (int) dim, size); +} + +}} diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h new file mode 100644 index 0000000..2bf7f0a --- /dev/null +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -0,0 +1,334 @@ +#pragma once + +#include "Exceptions.h" + +#include "cudnn-wrapper.h" +#include +#include +#include "ATen/cuda/ATenCUDAGeneral.h" +#include + +#if CUDNN_VERSION < 7000 + +#include + +/* +Note [cuDNN dropout descriptor initialization] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, setting descriptors in cuDNN is cheap (e.g., +cudnnSetTensorNdDescriptor). However, this is not the case for +cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an +expensive precomputation to initialize the random number generator states. In +cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor, +which means that law-abiding clients were expected to generate a dropout +descriptor once and cache it. However, our ATen interface is (1) stateless (so +we can't cache the descriptors) and (2) does not accept arbitrary user types in +its interface (so we can't pass the descriptor in). This puts us in a pickle. + +In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which +forgoes the expensive initialization process, and can initialize the +descriptor with a pre-initialized state CUDA tensor. This is great, because +it means we can simply pass in the state tensor and then initialize the +descriptor internally. Unfortunately, this function is not available in +cuDNN 6. + +To work around this, we break the cuDNN abstraction barrier, and have +the struct layout of the underlaying dropout descriptor. With this struct, +we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great! +*/ + +// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization] +struct cudnnDropoutStruct { + float dropout; + int nstates; + void * states; +}; + +#endif + +namespace at { namespace native { + +// TODO: Add constructors for all of the descriptors + +inline int dataSize(cudnnDataType_t dataType) +{ + switch (dataType) { + case CUDNN_DATA_HALF: return 2; + case CUDNN_DATA_FLOAT: return 4; + default: return 8; + } +} + +// The stride for a size-1 dimensions is not uniquely determined; in +// fact, it can be anything you want, because the fact that the +// tensor is size 1 at this dimension means that you will never actually +// try advancing your pointer by this stride. +// +// However, CuDNN has a much more stringent requirement on strides: +// if you are passing a contiguous input, it better be the case +// that the stride for dim i is the product of the sizes of dims +// i+1 to the end. This stride is indeed uniquely determined. This +// function modifies 'stride' in place so this invariant holds. +static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) { + int64_t z = 1; + for(int d = dim-1; d >= 0; d--) + { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + +template +struct DescriptorDeleter { + void operator()(T* x) { + if (x != nullptr) { + AT_CUDNN_CHECK(dtor(x)); + } + } +}; + +// A generic class for wrapping cuDNN descriptor types. 
All you need +// is to give the underlying type the Descriptor_t points to (usually, +// if it's cudnnTensorDescriptor_t it points to cudnnTensorStruct), +// the constructor and the destructor. Subclasses are responsible +// for defining a set() function to actually set the descriptor. +// +// Descriptors default construct to a nullptr, and have a descriptor +// initialized the first time you call set() or any other initializing +// function. +template +class AT_CUDA_API Descriptor +{ +public: + // TODO: Figure out why const-correctness doesn't work here + + // Use desc() to access the underlying descriptor pointer in + // a read-only fashion. Most client code should use this. + // If the descriptor was never initialized, this will return + // nullptr. + T* desc() const { return desc_.get(); } + T* desc() { return desc_.get(); } + + // Use mut_desc() to access the underlying desciptor pointer + // if you intend to modify what it points to (e.g., using + // cudnnSetFooDescriptor). This will ensure that the descriptor + // is initialized. Code in this file will use this function. + T* mut_desc() { init(); return desc_.get(); } +protected: + void init() { + if (desc_ == nullptr) { + T* raw_desc; + AT_CUDNN_CHECK(ctor(&raw_desc)); + desc_.reset(raw_desc); + } + } +private: + std::unique_ptr> desc_; +}; + +class AT_CUDA_API TensorDescriptor + : public Descriptor +{ +public: + TensorDescriptor() {} + explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) { + set(t, pad); + } + + // Note [CuDNN broadcast padding] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // pad specifies the minimum dimensionality of the tensor descriptor + // we produce (it doesn't have anything to do with, e.g., convolution + // padding). If 't' is lower-dimensional than 'pad', the remaining + // dimensions (on the right) are padded with ones. This doesn't + // affect the underlying data layout. This is particularly useful for + // dealing with a pecularity of the CuDNN API, which is that broadcasting in CuDNN is + // done in two steps: first, the client code is expected to pad out + // (the dimensions) input tensors to be the same dimension as the + // target broadcast, and then second, CuDNN takes of actually + // broadcasting size 1 dimensions. 
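For example (an illustrative sketch, not part of this patch; `t` is assumed to be a 3-D CUDA tensor with sizes [N, C, L]):

```
// With pad = 4 the descriptor reports four dimensions, [N, C, L, 1], while
// t's own sizes, strides and storage are left untouched.
at::native::TensorDescriptor desc(t, /*pad=*/4);
```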
+ + void set(const at::Tensor &t, size_t pad = 0); + void set(cudnnDataType_t dataType, IntList sizes, IntList strides, size_t pad = 0); + + void print(); + +private: + void set(cudnnDataType_t dataType, int dim, int* size, int* stride) { + fixSizeOneDimStride(dim, size, stride); + AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); + +class FilterDescriptor + : public Descriptor +{ +public: + void set(const at::Tensor &t, int64_t pad = 0); + +private: + void set(cudnnDataType_t dataType, int dim, int* size) { + AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, CUDNN_TENSOR_NCHW, dim, size)); + } +}; + +struct AT_CUDA_API ConvolutionDescriptor + : public Descriptor +{ + void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) { + cudnnDataType_t mathType = dataType; + if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT; + AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, + CUDNN_CROSS_CORRELATION, mathType)); +#if CUDNN_VERSION >= 7000 + AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups)); + AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); + if(dataType == CUDNN_DATA_HALF) + AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +#endif + } +}; + +struct AT_CUDA_API SpatialTransformerDescriptor + : public Descriptor +{ + void set(cudnnDataType_t dataType, int dim, int* size) { + AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size)); + } +}; + +#if CUDNN_VERSION < 7000 + +// See Note [cuDNN dropout descriptor initialization] +inline cudnnStatus_t cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends. + // This is not entirely accurate but is good enough to catch some API + // uses which would not be compatible in cuDNN 7. Feel free to fix + // this if you notice something is wrong. + if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE; + if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE; + size_t expectedStateSizeInBytes; + // State size will differ depending on size of GPU + auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes); + if (ret != CUDNN_STATUS_SUCCESS) return ret; + if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE; + dropoutDesc->dropout = dropout; + dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t); + dropoutDesc->states = states; + return CUDNN_STATUS_SUCCESS; +} + +#endif // CUDNN_VERSION + +struct AT_CUDA_API DropoutDescriptor + : public Descriptor +{ + at::Tensor state; + + // Initialize a dropout descriptor's RNG state. + // WARNING: This function is very expensive, avoid calling this function! + // NB: it takes a Type so that we can generate a Variable if necessary. 
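A hedged sketch of the intended lifecycle, combining the three entry points defined just below so that the expensive RNG setup runs only once (`handle` and `byte_cuda_type`, a CUDA Byte at::Type, are assumed to be supplied by the caller):

```
at::Tensor make_and_reuse_dropout_state(cudnnHandle_t handle,
                                        const at::Type& byte_cuda_type) {
  at::native::DropoutDescriptor first;
  first.initialize_rng(byte_cuda_type, handle, /*dropout=*/0.5f, /*seed=*/42);  // expensive
  at::Tensor cached_state = first.state;              // keep this tensor around

  at::native::DropoutDescriptor later;
  later.set(handle, /*dropout=*/0.5f, cached_state);  // cheap: reuses the cached state

  at::native::DropoutDescriptor no_drop;
  no_drop.set_no_dropout(handle);                     // dropout == 0 path, also cheap
  return cached_state;
}
```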
+ void initialize_rng(const Type& type, cudnnHandle_t handle, float dropout, long long int seed) { + AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + size_t state_size; + AT_CUDNN_CHECK(cudnnDropoutGetStatesSize(handle, &state_size)); + AT_ASSERT(type.is_cuda()); + AT_ASSERT(type.scalarType() == kByte); + state = at::empty({static_cast(state_size)}, type); + AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed)); + } + + // Restore a dropout descriptor given a dropout probability and existing RNG state. + // See Note [cuDNN dropout descriptor initialization] + void set(cudnnHandle_t handle, float dropout, at::Tensor state_) { + AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + state = state_; + void *state_ptr = state.data_ptr(); + size_t state_size = state.size(0); + // NB: The seed doesn't actually matter, so we give a dummy value + AT_CUDNN_CHECK(cudnnRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */)); + } + + // Restore a dropout descriptor corresponding to no dropout + // See Note [cuDNN dropout descriptor initialization] + void set_no_dropout(cudnnHandle_t handle) { + // NB: seed doesn't matter when dropout = 0, because no random number + // initialization actually takes place when there is no dropout. + // NB: Empirically, cudnnSetDropoutDescriptor is cheap when + // dropoot == 0 + AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); + } +}; + +struct AT_CUDA_API RNNDescriptor + : public Descriptor +{ + DropoutDescriptor dropout_desc_; + void set(cudnnHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc, + cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional, + cudnnRNNMode_t mode, cudnnDataType_t datatype) { + dropout_desc_ = std::move(dropout_desc); + AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v6( + handle, + mut_desc(), + hidden_size, + num_layers, + dropout_desc_.desc(), + input_mode, + bidirectional, + mode, + CUDNN_RNN_ALGO_STANDARD, + datatype)); +#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000 + cudaDeviceProp* prop = globalContext().getCurrentDeviceProperties(); + if (prop->major >= 7) { + if (datatype == CUDNN_DATA_HALF) { + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); + } else { + // Technically, as the default it's not necessary to explicitly + // set this. + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH); + } + } +#endif + } +}; + +union Constant +{ + float f; + double d; + Constant(cudnnDataType_t dataType, double value) { + if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) { + f = (float) value; + } else { + d = value; + } + } +}; + +}} // namespace diff --git a/aten/src/ATen/cudnn/Exceptions.h b/aten/src/ATen/cudnn/Exceptions.h new file mode 100644 index 0000000..b59127e --- /dev/null +++ b/aten/src/ATen/cudnn/Exceptions.h @@ -0,0 +1,17 @@ +#pragma once +#include +#define AT_CUDNN_CHECK(STATUS) \ + if (STATUS != CUDNN_STATUS_SUCCESS) { \ + if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ + AT_ERROR( \ + "CuDNN error: ", \ + cudnnGetErrorString(STATUS), \ + ". 
This error may appear if you passed in a non-contiguous input."); \ + } else { \ + AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ + } \ + } +#define AT_CUDA_CHECK(STATUS) \ + if (STATUS != cudaSuccess) { \ + AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ + } diff --git a/aten/src/ATen/cudnn/Handles.cpp b/aten/src/ATen/cudnn/Handles.cpp new file mode 100644 index 0000000..7aae86d --- /dev/null +++ b/aten/src/ATen/cudnn/Handles.cpp @@ -0,0 +1,51 @@ +#include "Handles.h" + +#include "Exceptions.h" + +#include +#include + +// TODO: Get rid of the mutex, and just initialize these +// handles in at::Context along with lazy CUDA initialization + +namespace at { namespace native { + +namespace { + +struct Handle { + cudnnHandle_t handle; + Handle() : handle(NULL) { + AT_CUDNN_CHECK(cudnnCreate(&handle)); + } + ~Handle() { + if (handle) { +// this is because of something dumb in the ordering of +// destruction. Sometimes atexit, the cuda context (or something) +// would already be destroyed by the time this gets destroyed. It +// happens in fbcode setting. @colesbury and I decided to not destroy +// the handle as a workaround. +// - @soumith +#ifdef NO_CUDNN_DESTROY_HANDLE +#else + cudnnDestroy(handle); +#endif + } + } +}; + +std::mutex mutex; +std::unordered_map handles; + +} // namespace + + +cudnnHandle_t getCudnnHandle() +{ + int device; + AT_CUDA_CHECK(cudaGetDevice(&device)); + + std::lock_guard guard(mutex); + return handles[device].handle; +} + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Handles.h b/aten/src/ATen/cudnn/Handles.h new file mode 100644 index 0000000..369b1f3 --- /dev/null +++ b/aten/src/ATen/cudnn/Handles.h @@ -0,0 +1,10 @@ +#pragma once + +#include "cudnn-wrapper.h" +#include "ATen/cuda/ATenCUDAGeneral.h" + +namespace at { namespace native { + +AT_CUDA_API cudnnHandle_t getCudnnHandle(); + +}} // namespace diff --git a/aten/src/ATen/cudnn/README.md b/aten/src/ATen/cudnn/README.md new file mode 100644 index 0000000..057fbc9 --- /dev/null +++ b/aten/src/ATen/cudnn/README.md @@ -0,0 +1,4 @@ +All files living in this directory are written with the assumption that cuDNN is available, +which means that these code are not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever +you need to use definitions from here, please guard the `#include` and +definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](native/cudnn/BatchNorm.cpp). 
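A minimal sketch of the guard pattern this README asks for (the wrapper function below is illustrative, not taken from the tree):

```
#include "ATen/cuda/CUDAConfig.h"   // provides AT_CUDNN_ENABLED()

#if AT_CUDNN_ENABLED()
#include "ATen/cudnn/Types.h"       // headers under ATen/cudnn/ must be guarded too
#endif

int64_t reported_cudnn_version() {
#if AT_CUDNN_ENABLED()
  return at::native::cudnn_version();
#else
  return -1;  // built without cuDNN support
#endif
}
```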
diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp new file mode 100644 index 0000000..7c11d46 --- /dev/null +++ b/aten/src/ATen/cudnn/Types.cpp @@ -0,0 +1,24 @@ +#include "Types.h" + +#include + +namespace at { namespace native { + +cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { + if (tensor.type().scalarType() == at::kFloat) { + return CUDNN_DATA_FLOAT; + } else if (tensor.type().scalarType() == at::kDouble) { + return CUDNN_DATA_DOUBLE; + } else if (tensor.type().scalarType() == at::kHalf) { + return CUDNN_DATA_HALF; + } + std::string msg("getCudnnDataType() not supported for "); + msg += at::toString(tensor.type().scalarType()); + throw std::runtime_error(msg); +} + +int64_t cudnn_version() { + return CUDNN_VERSION; +} + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Types.h b/aten/src/ATen/cudnn/Types.h new file mode 100644 index 0000000..33fa8e6 --- /dev/null +++ b/aten/src/ATen/cudnn/Types.h @@ -0,0 +1,12 @@ +#pragma once + +#include "cudnn-wrapper.h" +#include + +namespace at { namespace native { + +cudnnDataType_t getCudnnDataType(const at::Tensor& tensor); + +int64_t cudnn_version(); + +}} // namespace at::cudnn diff --git a/aten/src/ATen/cudnn/Utils.h b/aten/src/ATen/cudnn/Utils.h new file mode 100644 index 0000000..c2e5dcb --- /dev/null +++ b/aten/src/ATen/cudnn/Utils.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "THC/THC.h" +#include "cudnn-wrapper.h" +#include "Handles.h" + +namespace at { namespace native { + +inline void setCuDNNStreamToCurrent() { + // TODO: Should getCurrentStream be a method on Context? + AT_CUDNN_CHECK(cudnnSetStream(getCudnnHandle(), THCState_getCurrentStream(globalContext().getTHCState()))); +} + +// cuDNN has a buggy check for tensor being contiguous (that is, it does +// not ignore stride for dimension that is equal to 0). This function +// makes tensors which have zero stride contiguous, by setting the +// strides to 1 as cuDNN likes. +inline Tensor contiguousIfZeroInStrides(const Tensor& t) { + for (auto s : t.strides()) { + if (s == 0) return t.contiguous(); + } + return t; +} + +}} diff --git a/aten/src/ATen/cudnn/cudnn-wrapper.h b/aten/src/ATen/cudnn/cudnn-wrapper.h new file mode 100644 index 0000000..320646e --- /dev/null +++ b/aten/src/ATen/cudnn/cudnn-wrapper.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#define STRINGIFY(x) #x +#define STRING(x) STRINGIFY(x) + +#if CUDNN_MAJOR < 6 +#pragma message ("CuDNN v" STRING(CUDNN_MAJOR) " found, but need at least CuDNN v6. You can get the latest version of CuDNN from https://developer.nvidia.com/cudnn or disable CuDNN with NO_CUDNN=1") +#pragma message "We strongly encourage you to move to 6.0 and above." +#pragma message "This message is intended to annoy you enough to update." 
+#endif + +#undef STRINGIFY +#undef STRING + diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py new file mode 100644 index 0000000..f020dd0 --- /dev/null +++ b/aten/src/ATen/cwrap_parser.py @@ -0,0 +1,22 @@ +import yaml + +# follows similar logic to cwrap, ignores !inc, and just looks for [[]] + + +def parse(filename): + with open(filename, 'r') as file: + declaration_lines = [] + declarations = [] + in_declaration = False + for line in file.readlines(): + line = line.rstrip() + if line == '[[': + declaration_lines = [] + in_declaration = True + elif line == ']]': + in_declaration = False + declaration = yaml.load('\n'.join(declaration_lines)) + declarations.append(declaration) + elif in_declaration: + declaration_lines.append(line) + return declarations diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp new file mode 100644 index 0000000..b6897ed --- /dev/null +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -0,0 +1,76 @@ +#include + +#include + +#include +#include +#include + +namespace at { +namespace detail { + +void default_set_device(int32_t) { + AT_ERROR( + "DynamicCUDAInterface::set_device called " + "before CUDA library was loaded"); +} + +void default_get_device(int32_t*) { + AT_ERROR( + "DynamicCUDAInterface::get_device called " + "before CUDA library was loaded"); +} + +void default_unchecked_set_device(int32_t) { + AT_ERROR( + "DynamicCUDAInterface::unchecked_set_device called " + "before CUDA library was loaded"); +} + +void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { + AT_ERROR( + "DynamicCUDAInterface::cuda_stream_create_with_priority called " + "before CUDA library was loaded"); +} + +void default_cuda_stream_destroy(cudaStream_t) { + AT_ERROR( + "DynamicCUDAInterface::cuda_stream_destroy called " + "before CUDA library was loaded"); +} + +// Default the static members of DynamicCUDAInterface. +void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; +void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; +void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = + default_unchecked_set_device; +void (*DynamicCUDAInterface::cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t) + = default_cuda_stream_create_with_priority; +void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) + = default_cuda_stream_destroy; + + +const CUDAHooksInterface& getCUDAHooks() { + static std::unique_ptr cuda_hooks; + // NB: The once_flag here implies that if you try to call any CUDA + // functionality before libATen_cuda.so is loaded, CUDA is permanently + // disabled for that copy of ATen. In principle, we can relax this + // restriction, but you might have to fix some code. See getVariableHooks() + // for an example where we relax this restriction (but if you try to avoid + // needing a lock, be careful; it doesn't look like Registry.h is thread + // safe...) 
+ static std::once_flag once; + std::call_once(once, [] { + cuda_hooks = CUDAHooksRegistry()->Create("CUDAHooks", CUDAHooksArgs{}); + if (!cuda_hooks) { + cuda_hooks = + std::unique_ptr(new CUDAHooksInterface()); + } + }); + return *cuda_hooks; +} +} // namespace detail + +AT_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) + +} // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h new file mode 100644 index 0000000..e15cf36 --- /dev/null +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -0,0 +1,182 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +// Forward declare these CUDA types here to avoid including CUDA headers in +// ATen headers, which would make ATen always require CUDA to build. +struct THCState; +struct cudaDeviceProp; +struct CUstream_st; +typedef struct CUstream_st* cudaStream_t; + +#ifndef __HIP_PLATFORM_HCC__ +// pyHIPIFY rewrites this as: +// +// struct cusparseContext; +// typedef struct cusparseContext *hipsparseHandle_t; +// +// however, this forward declaration is wrong +// the way that the HIP headers define hipsparseHandle_t is +// +// typedef cusparseHandle_t hipsparseHandle_t +// +// so the rewrite is wrong. +struct cusparseContext; +typedef struct cusparseContext *cusparseHandle_t; +#endif + +namespace at { +class Context; +} + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +constexpr const char* CUDA_HELP = + "PyTorch splits its backend into two shared libraries: a CPU library " + "and a CUDA library; this error has occurred because you are trying " + "to use some CUDA functionality, but the CUDA library has not been " + "loaded by the dynamic linker for some reason. The CUDA library MUST " + "be loaded, EVEN IF you don't directly use any symbols from the CUDA library! " + "One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many " + "dynamic linkers will delete dynamic library dependencies if you don't " + "depend on any of their symbols. You can check if this has occurred by " + "using ldd on your binary to see if there is a dependency on *_cuda.so " + "library."; + +// The CUDAHooksInterface is an omnibus interface for any CUDA functionality +// which we may want to call into from CPU code (and thus must be dynamically +// dispatched, to allow for separate compilation of CUDA code). How do I +// decide if a function should live in this class? There are two tests: +// +// 1. Does the *implementation* of this function require linking against +// CUDA libraries? +// +// 2. Is this function *called* from non-CUDA ATen code? +// +// (2) should filter out many ostensible use-cases, since many times a CUDA +// function provided by ATen is only really ever used by actual CUDA code. +// +// TODO: Consider putting the stub definitions in another class, so that one +// never forgets to implement each virtual function in the real implementation +// in CUDAHooks. This probably doesn't buy us much though. +struct AT_API CUDAHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~CUDAHooksInterface() {} + + // Initialize THCState and, transitively, the CUDA state + virtual std::unique_ptr initCUDA() const { + AT_ERROR("Cannot initialize CUDA without ATen_cuda library. 
", CUDA_HELP); + } + + virtual std::unique_ptr initCUDAGenerator(Context*) const { + AT_ERROR("Cannot initialize CUDA generator without ATen_cuda library. ", CUDA_HELP); + } + + virtual bool hasCUDA() const { + return false; + } + + virtual bool hasCuDNN() const { + return false; + } + +#ifndef __HIP_PLATFORM_HCC__ + virtual cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const { + AT_ERROR("Cannot getCurrentCUDASparseHandle() without ATen_cuda library. ", CUDA_HELP); + } +#endif + + virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { + AT_ERROR("Cannot getCurrentDeviceProperties() without ATen_cuda library. ", CUDA_HELP); + } + + virtual struct cudaDeviceProp* getDeviceProperties(THCState*, int device) + const { + AT_ERROR("Cannot getDeviceProperties() without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t current_device() const { + return -1; + } + + virtual Allocator* getPinnedMemoryAllocator() const { + AT_ERROR("Pinned memory requires CUDA. ", CUDA_HELP); + } + + virtual void registerCUDATypes(Context*) const { + AT_ERROR("Cannot registerCUDATypes() without ATen_cuda library. ", CUDA_HELP); + } + + virtual bool compiledWithCuDNN() const { + return false; + } + + virtual bool supportsDilatedConvolutionWithCuDNN() const { + return false; + } + + virtual long versionCuDNN() const { + AT_ERROR("Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); + } + + virtual double batchnormMinEpsilonCuDNN() const { + AT_ERROR( + "Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t cuFFTGetPlanCacheMaxSize() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual void cuFFTSetPlanCacheMaxSize(int64_t max_size) const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual int64_t cuFFTGetPlanCacheSize() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual void cuFFTClearPlanCache() const { + AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); + } + + virtual int getNumGPUs() const { + return 0; + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct AT_API CUDAHooksArgs {}; + +AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +#define REGISTER_CUDA_HOOKS(clsname) \ + AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) + +namespace detail { +AT_API const CUDAHooksInterface& getCUDAHooks(); + +/// This class exists to let us access `cudaSetDevice`, `cudaGetDevice` and CUDA +/// error handling functions, when CUDA is available. These functions will first +/// default to no-ops. When the `ATen` GPU library is loaded, they will be set to +/// the `cudaSetDevice`/`cudaGetDevice` functions. This allows us to access them +/// with only a single pointer indirection, while virtual dispatch would require +/// two (one for the virtual call, one for `cudaSetDevice`/`cudaGetDevice`). 
+struct AT_API DynamicCUDAInterface { + static void (*set_device)(int32_t); + static void (*get_device)(int32_t*); + static void (*unchecked_set_device)(int32_t); + static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); + static void (*cuda_stream_destroy)(cudaStream_t); +}; +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp new file mode 100644 index 0000000..07531d8 --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.cpp @@ -0,0 +1,7 @@ +#include + +namespace at { namespace detail { + +void deleteNothing(void*) {} + +}} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h new file mode 100644 index 0000000..866c0ef --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -0,0 +1,84 @@ +#include + +#include + +namespace at { + +using DeleterFnPtr = void(*)(void*); + +namespace detail { + +// Does not delete anything +AT_API void deleteNothing(void*); + +// A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but +// with three major differences: +// +// 1) It is specialized to void +// +// 2) It is specialized for a function pointer deleter +// void(void* ctx); i.e., the deleter doesn't take a +// reference to the data, just to a context pointer +// (erased as void*). In fact, internally, this pointer +// is implemented as having an owning reference to +// context, and a non-owning reference to data; this is why +// you release_context(), not release() (the conventional +// API for release() wouldn't give you enough information +// to properly dispose of the object later.) +// +// 3) The deleter is guaranteed to be called when the unique +// pointer is destructed and the context is non-null; this is different +// from std::unique_ptr where the deleter is not called if the +// data pointer is null. +// +// Some of the methods have slightly different types than std::unique_ptr +// to reflect this. +// +class UniqueVoidPtr { +private: + // Lifetime tied to ctx_ + void* data_; + std::unique_ptr ctx_; +public: + UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} + UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) + : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} + void* operator->() const { return data_; } + void* get() const { return data_; } + void* get_context() const { return ctx_.get(); } + void* release_context() { return ctx_.release(); } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + if (get_deleter() != expected_deleter) return nullptr; + return static_cast(get_context()); + } + operator bool() const { return data_ || ctx_; } + DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } +}; + + +// Note [How UniqueVoidPtr is implemented] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// UniqueVoidPtr solves a common problem for allocators of tensor data, which +// is that the data pointer (e.g., float*) which you are interested in, is not +// the same as the context pointer (e.g., DLManagedTensor) which you need +// to actually deallocate the data. Under a conventional deleter design, you +// have to store extra context in the deleter itself so that you can actually +// delete the right thing. 
Implementing this with standard C++ is somewhat +// error-prone: if you use a std::unique_ptr to manage tensors, the deleter will +// not be called if the data pointer is nullptr, which can cause a leak if the +// context pointer is non-null (and the deleter is responsible for freeing both +// the data pointer and the context pointer). +// +// So, in our reimplementation of unique_ptr, which just store the context +// directly in the unique pointer, and attach the deleter to the context +// pointer itself. In simple cases, the context pointer is just the pointer +// itself. + +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } + +}} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.cpp b/aten/src/ATen/detail/VariableHooksInterface.cpp new file mode 100644 index 0000000..8569052 --- /dev/null +++ b/aten/src/ATen/detail/VariableHooksInterface.cpp @@ -0,0 +1,29 @@ +#include + +namespace at { + +namespace detail { + + // NB: The VariableHooks returned by this function may CHANGE after dlopen() + // NB: This function takes a lock, don't call it from perf critical paths + const VariableHooksInterface& getVariableHooks() { + static std::mutex var_hooks_mutex; + static std::unique_ptr var_hooks = nullptr; + static std::unique_ptr default_var_hooks = + std::unique_ptr(new VariableHooksInterface()); + std::lock_guard lock(var_hooks_mutex); + + if (!var_hooks) { + var_hooks = VariableHooksRegistry()->Create("VariableHooks", VariableHooksArgs{}); + } + if (var_hooks) { + return *var_hooks; + } + return *default_var_hooks; + } + +} + +AT_DEFINE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) + +} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h new file mode 100644 index 0000000..2871164 --- /dev/null +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +namespace at { + class Context; +} + +// NB: Registry class not actually in the namespace detail, due to limitations +// of Registry.h +namespace at { + +// The VariableHooksInterface is an interface for autograd functionality +// which currently doesn't live in libATen.so AND needs to be called from +// ATen. In this case, it is only the type registry for Variable types, +// letting us add extra variables types if CUDA types are initialized lazily. +// +// We may choose to absorb autograd into ATen, in which case this interface is obsolete. +// +struct AT_API VariableHooksInterface { + + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~VariableHooksInterface() {} + + virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { + // no-op if Variable not available; it'll get handled (if at all) when + // libtorch.so gets loaded + } + +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." 
in a variadic macro" +struct AT_API VariableHooksArgs {}; + +AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) +#define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) + +namespace detail { + AT_API const VariableHooksInterface& getVariableHooks(); +} + +} // namespace at diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h new file mode 100644 index 0000000..f8dc8fc --- /dev/null +++ b/aten/src/ATen/dlpack.h @@ -0,0 +1,141 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 010 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*! + * \brief The device type in DLContext. + */ +typedef enum { + kDLCPU = 1, + kDLGPU = 2, + // kDLCPUPinned = kDLCPU | kDLGPU + kDLCPUPinned = 3, + kDLOpenCL = 4, + kDLMetal = 8, + kDLVPI = 9, + kDLROCM = 10, +} DLDeviceType; + +/*! + * \brief A Device context for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! \brief The device index */ + int device_id; +} DLContext; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + kDLInt = 0U, + kDLUInt = 1U, + kDLFloat = 2U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. + * + * Examples + * - float: type_code = 2, bits = 32, lanes=1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 + * - int8: type_code = 0, bits = 8, lanes=1 + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ + void* data; + /*! \brief The device context of the tensor */ + DLContext ctx; + /*! \brief Number of dimensions */ + int ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor, + * can be NULL, indicating tensor is compact. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to faciliate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + */ +typedef struct DLManagedTensor { + /*! 
\brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/aten/src/ATen/extract_cwrap.py b/aten/src/ATen/extract_cwrap.py new file mode 100644 index 0000000..64c2281 --- /dev/null +++ b/aten/src/ATen/extract_cwrap.py @@ -0,0 +1,38 @@ +from optparse import OptionParser + +parser = OptionParser() +parser.add_option('-o', '--output', help='where to write the result file.', + action='store', default='.') +options, _ = parser.parse_args() + +files = [ + # '../../csrc/cudnn/cuDNN.cwrap', + '../../csrc/generic/TensorMethods.cwrap', + # '../../csrc/generic/methods/SparseTensor.cwrap', + '../../csrc/generic/methods/Tensor.cwrap', + '../../csrc/generic/methods/TensorApply.cwrap', + '../../csrc/generic/methods/TensorCompare.cwrap', + '../../csrc/generic/methods/TensorCuda.cwrap', + '../../csrc/generic/methods/TensorMath.cwrap', + '../../csrc/generic/methods/TensorRandom.cwrap', + # '../../csrc/generic/methods/TensorSerialization.cwrap', +] + +declaration_lines = [] + +for filename in files: + with open(filename, 'r') as file: + in_declaration = False + for line in file.readlines(): + line = line.rstrip() + if line == '[[': + in_declaration = True + declaration_lines.append(line) + elif line == ']]': + in_declaration = False + declaration_lines.append(line) + elif in_declaration: + declaration_lines.append(line) + +with open(options.output, 'w') as output: + output.write('\n'.join(declaration_lines) + '\n') diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py new file mode 100644 index 0000000..1c06654 --- /dev/null +++ b/aten/src/ATen/function_wrapper.py @@ -0,0 +1,1517 @@ +# HEY! Trying to understand what this file does? Read +# "what has to be done to add a Operation ..." first! + +import re +from code_template import CodeTemplate + +try: + import typing # noqa: F401 +except ImportError: + raise RuntimeError( + 'Missing build dependency: Unable to import the `typing` module. ' + 'Please install it via `conda install typing` or `pip install typing`') + +# flake8 doesn't take into account usages in type annotations. +from typing import Union, Set # noqa: F401 +from typing import Any, Dict, List, Optional, Tuple, NamedTuple + +try: + from mypy_extensions import TypedDict +except ImportError: + # Avoid the dependency on the mypy_extensions package. + # It is required, however, for type checking. + def TypedDict(name, attrs, total=True): # type: ignore + return Dict[Any, Any] + +import sys +if sys.version_info[0] == 3: + string_type = str +else: + string_type = basestring + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# what has to be done to add a Operation ... +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# 1. 
if broadcasting or without the full list of arguments, add a non-virtual +# declaration under Type.h (right now, we call this template +# BROADCAST but it also handles default arguments) +TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ +${return_type} ${api_name}(${type_method_formals_with_defaults}) const; +""") +# 2. broadcasting functions are implemented in Type.cpp +TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + Tensor ${broadcast_returns}; + std::tie(${broadcast_returns}) = ${broadcast_function}(${broadcast_actuals}, "${api_name}"); + return ${method_prefix_derived}${api_name}(${broadcast_modified_actuals}); +} +""") +# 3. add virtual dispatch declaration to Type.h and impl to Type.cpp; method_prefix_derived +# is present for providing a base-class definition for a derived-type method with a prefix. +# +# If the declaration is abstract, then the actual implementation will +# be in a derived type; we put in a simple default "not implemented" +# stub. However, if the declaration is concrete, we dispatch to the +# actual implementation. At the moment, this situation *only* occurs +# for 'native' declarations (so the native dispatch is hardcoded into +# the template here.) +TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const; +""") +TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ +${return_type} Type::${method_prefix_derived}${api_name}(${type_method_formals}) const { + AT_ERROR("${method_prefix_derived}${api_name} is not implemented for type ", toString()); +} +""") +TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ +virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const; +""") +DEPRECATED_TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ +AT_DEPRECATED(virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const); +""") +TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + ${type_definition_body} +} +""") +DEPRECATED_TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ +${return_type} Type::${api_name}(${type_method_formals}) const { + TensorOptions options(*this); + ${device_guard_declaration} + return at::native::${api_name}(${type_method_actuals}, options); +} +""") +# 4. add virtual override to TypeDerived.h +TYPE_DERIVED_DECLARATION = CodeTemplate("""\ +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; +""") +# 5. add override definition to TypeDerived.cpp +TYPE_DERIVED_DEFINITION = CodeTemplate("""\ +${return_type} ${Type}::${method_prefix_derived}${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + ${type_definition_body} +} +""") +# NB: As far as ezyang can tell, we don't *have* to codegen this, +# because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in +# the superclass. But it doesn't seem to be harmful. 
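# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original diff): the
# CodeTemplate objects above drive the generated Type methods.  The real
# CodeTemplate class lives in code_template.py (not shown here); the toy
# stand-in below only mimics its ${placeholder} substitution so the expansion
# of the TYPE_METHOD_DEFINITION_CONCRETE text can be seen for a hypothetical
# 'add_out'-style option.  Every value in the sample environment is made up.
from string import Template

TOY_TEMPLATE = Template(
    "${return_type} Type::${api_name}(${type_method_formals}) const {\n"
    "    ${device_guard_declaration}\n"
    "    ${type_definition_body}\n"
    "}\n")

sample_env = {
    'return_type': 'Tensor &',
    'api_name': 'add_out',   # hypothetical operator name
    'type_method_formals': 'Tensor & result, const Tensor & self, Scalar other',
    'device_guard_declaration': 'const DeviceGuard device_guard(self);',
    'type_definition_body': 'return at::native::add_out(result, self, other);',
}

print(TOY_TEMPLATE.substitute(sample_env))
# Unlike this toy, the real CodeTemplate also accepts list-valued
# substitutions (e.g. the formals list) and takes care of indentation.
# ---------------------------------------------------------------------------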
+TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ +${return_type} ${Type}::${api_name}(${type_method_formals}) const { + ${device_guard_declaration} + const auto& self_ty = *this; + (void)self_ty; + ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); +} +""") +TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ +${return_type} ${Type}::${api_name}(${type_method_formals}) const { + AT_ERROR("${api_name} not supported on ${Type}"); +} +""") +TYPE_DEFINITION_BODY_NATIVE = CodeTemplate("""\ +${return_call} at::native::${native_type_method_dispatch}(/* native_actuals */ ${native_actuals}); +""") + +# add non-virtual declaration to Tensor.h +TENSOR_METHOD_DECLARATION = CodeTemplate("""\ +${return_type} ${api_name}(${method_formals_with_defaults})${const_mark}; +""") +# add non-virtual declaration to Tensor.cpp +TENSOR_METHOD_DEFINITION = CodeTemplate("""\ +inline ${return_type} Tensor::${api_name}(${method_formals})${const_mark} { + return type().${api_name}(${method_actuals}); +} +""") +# add a method declaration in Functions.h +FUNCTION_DECLARATION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals_with_defaults}); +""") +# add a method declaration in Functions.h +DEPRECATED_FUNCTION_DECLARATION = CodeTemplate("""\ +AT_DEPRECATED(static inline ${return_type} ${api_name}(${formals_with_defaults})); +""") +# add method definition in Functions.h +FUNCTION_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + return ${inferred_type}.${api_name}(${type_method_actuals}); +} +""") +# add a native declaration for a native function +NATIVE_DECLARATION = CodeTemplate("""\ +AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +""") + +# special method definition for factory functions in Functions.h +FACTORY_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + const DeviceGuard guard(options.device()); + return at::native::${api_name}(${type_method_actuals}); +} +""") + +# special method definition for *deprecated* factory functions in Functions.h +DEPRECATED_FACTORY_DEFINITION = CodeTemplate("""\ +static inline ${return_type} ${api_name}(${formals}) { + return at::${api_name}(${type_method_actuals}, TensorOptions(${inferred_type})); +} +""") + +# We need to cast to the base type because C++ may hide the base class +# implementation of ${api_name} if we have overloaded a function with +# the same name (but different signature) already +ZERO_DIM_CHECK = CodeTemplate("""\ +if (${check_name}.dim() == 0) { + return static_cast(this)->${api_name}(${zero_dim_actuals}); +}""") + +ZERO_DIM_ONLY = CodeTemplate("""\ +AT_ERROR("${api_name} only supports a 0-dimensional ${check_name} tensor, but got tensor " + "with ", ${check_name}.dim(), " dimension(s)."); +""") + +SPARSE_CHECK = CodeTemplate("""\ +if(${check_name}.type().is_sparse()) { + return static_cast(this)->${api_name}(${sparse_actuals}); +}""") + +BUFFER_DEFINITION = CodeTemplate("""\ +auto ${name}_ = new ${Tensor}(context); +auto ${name} = Tensor(${name}_, false);""") + +CONDITIONAL_INITIALIZER = CodeTemplate("""\ +if (${name}.defined()) { + ${initializer} +}""") + +CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") + +HALF_CONVERSION = CodeTemplate("convert(${value})") + + +class NYIError(Exception): + """Indicates we don't support this declaration yet""" + + def __init__(self, reason): + self.reason = reason + + +TYPE_FORMAL_GENERIC = { + 'THTensor*': 'Tensor &', + 'THSTensor*': 
'SparseTensorRef', + 'THBoolTensor*': 'Tensor &', + 'THIndexTensor*': 'Tensor &', + 'THIntegerTensor*': 'Tensor &', + 'THDenseTensor*': 'Tensor &', + 'THDenseIndexTensor*': 'Tensor &', + 'THStorage*': 'Storage &', + 'THGenerator*': 'Generator *', + 'THSize*': 'IntList', + 'THStride*': 'IntList', + 'accreal': 'Scalar', + 'real': 'Scalar', + 'long': 'int64_t', +} + +DYNAMIC_TYPE = { + 'THTensor*': 'Tensor', + 'THSTensor*': 'SparseTensorRef', + 'THBoolTensor*': 'BoolTensor', + 'THIndexTensor*': 'IndexTensor', + 'THIntegerTensor*': 'IntegerTensor', + 'THDenseTensor*': 'Tensor', + 'THDenseIndexTensor*': 'IndexTensor', + 'THStorage*': 'Storage', + 'THGenerator*': 'Generator*', + 'THSize*': 'IntList', + 'THStride*': 'IntList', + 'accreal': 'accreal', + 'real': 'real', + 'long': 'int64_t', +} + +NATIVE_DYNAMIC_TYPE = { + 'Tensor &': 'Tensor', + 'const Tensor &': 'Tensor', +} + +TYPE_RETURN = { + 'THTensor*': 'Tensor', + 'THIndexTensor*': 'Tensor', + 'THBoolTensor*': 'Tensor', + 'THIntegerTensor*': 'Tensor', + 'THSTensor*': 'Tensor', + 'THDenseTensor*': 'Tensor', + 'THDenseIndexTensor*': 'Tensor', + 'real': 'Tensor', + 'accreal': 'Tensor', + 'long': 'int64_t', +} + +CHECKED_CAST = { + 'THTensor*': + CodeTemplate( + 'checked_cast_tensor<${Tensor}>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THSTensor*': + CodeTemplate( + 'checked_cast_tensor(${arg_name}.tref.pImpl,"${arg_name}",${arg_pos},false)'), + 'THBoolTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}ByteTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THIndexTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}LongTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THIntegerTensor*': + CodeTemplate( + 'checked_cast_tensor<${Backend}IntTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THDenseTensor*': + CodeTemplate( + 'checked_cast_tensor<${DenseTensor}>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THDenseIndexTensor*': + CodeTemplate( + 'checked_cast_tensor<${DenseBackend}LongTensor>(${arg_name}.pImpl,"${arg_name}",${arg_pos}, ${null_okay})'), + 'THStorage*': CodeTemplate('checked_cast_storage<${Storage}>(&${arg_name},"${arg_name}",${arg_pos})'), + 'THGenerator*': + CodeTemplate( + 'check_generator<${Backend}Generator>(${arg_name}, &context->defaultGenerator(backend()))'), + # This is a cast done via direct-construction + 'THSize*': CodeTemplate('THLongStorageView ${result_name}(${arg_name}, THLongStorageViewKind::SIZE);'), + # This is a cast done via direct-construction + 'THStride*': CodeTemplate('THLongStorageView ${result_name}(${arg_name}, THLongStorageViewKind::STRIDE);'), + 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), + 'accreal': CodeTemplate('${arg_name}.to${AccScalarName}()'), + 'TensorList': CodeTemplate('tensor_list_checked_cast<${Tensor}, Tensor, ' + '${THTensor}>(${arg_name},"${arg_name}",${arg_pos})'), + 'IntList': CodeTemplate('check_intlist<${size}>(${arg_name}, "${arg_name}", ${arg_pos}${,default_init})') +} + +DIRECT_CONSTRUCTION_CHECKED_CAST = {'THSize*', 'THStride*'} + +CHECKED_USE = { + 'THTensor*': '{}_->tensor', + 'THSTensor*': '{}_->tensor', + 'THIndexTensor*': '{}_->tensor', + 'THBoolTensor*': '{}_->tensor', + 'THIntegerTensor*': '{}_->tensor', + 'THDenseTensor*': '{}_->tensor', + 'THDenseIndexTensor*': '{}_->tensor', + 'THStorage*': '{}_->storage', + 'THGenerator*': '{}_->generator', + 'TensorList': "{0}_.data(), {0}_.size()", +} + +CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') + +ALLOC_WRAP = { + 'THTensor*': 'new ${Tensor}(context${,arguments})', + 'THBoolTensor*': 'new ${Backend}ByteTensor(context${,arguments})', + 'THIndexTensor*': 'new ${Backend}LongTensor(context${,arguments})', + 'THIntegerTensor*': 'new ${Backend}IntTensor(context${,arguments})', + 'THSTensor*': 'new Sparse${Tensor}(context${,arguments})', + 'THDenseTensor*': 'new ${DenseTensor}(context${,arguments})', + 'THDenseIndexTensor*': 'new ${DenseBackend}LongTensor(context${,arguments})', +} + +# Replacements for constants when calling into TH +CONSTANT_REPLACEMENTS = [ + ('AS_REAL', '${AS_REAL}'), + ('__storage_size.get\\(\\)', + 'THLongStorageView(static_cast(source.size()), THLongStorageViewKind::LENGTH)'), + ('__last_dim', 'self.ndimension()-1'), +] + +# Replacements for constants in header file function definitions +HEADER_CONSTANT_REPLACEMENTS = [ + (r'AS_REAL\((.*)\)', r'\1'), + ('__last_dim', '-1'), +] + + +class nested_dict(object): + def __init__(self, base, parent): + self.base, self.parent = base, parent + + def __getitem__(self, x): + r = self.base.get(x) + if r is not None: + return r + return self.parent[x] + + +Environment = TypedDict('Environment', { + 'ScalarName': str, + 'THTensor': str, + 'THType': str, + 'THTensor': str, + 'Backend': str, + 'AccScalarName': str, +}) + +TopEnvironment = TypedDict('TopEnvironment', { + 'type_registrations': List[str], + 'type_headers': List[str], + 'type_method_declarations': List[str], + 'type_method_definitions': List[str], + 'type_method_inline_definitions': List[str], + 'tensor_method_declarations': List[str], + 'tensor_method_definitions': List[str], + 'function_declarations': List[str], + 'function_definitions': List[str], + 'type_ids': List[str], + 'native_function_declarations': List[str], +}) + +# A Declarations.cwrap formal argument +# type can contain THTensor* types +THFormal = TypedDict('THFormal', { + 'name': str, + 'type': str, + 'dynamic_type': str, + 'kwarg_only': bool, + 'is_nullable': bool, + 'default': str, + 'default_init': str, + 'python_default_init': str, + 'output': bool, + 'size': int, + 'declared_type': str, + 'ignore_check': bool, + 'allocate': bool, + 'mask': bool, + 'if_true': bool, + 'if_false': bool, + 'wrap_dim': str, + # Broadcast is originally a str but gets unwrapped to a List or Dict in-place + 'broadcast': Any, + 'resize': str, + 'cpu_zero': bool, + 'zero': bool, + 'is_type_dispatched': bool, +}, total=False) + +# Generic ATen formal or native_functions.yaml formal argument. +# type can contain Tensor& reference types. 
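# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the
# nested_dict class above is what lets an option-level environment shadow the
# shared top_env during template substitution (env = nested_dict(option,
# top_env) further down).  The condensed copy below shows the lookup order
# with made-up keys and values.
class _nested_dict_sketch(object):
    def __init__(self, base, parent):
        self.base, self.parent = base, parent

    def __getitem__(self, x):
        r = self.base.get(x)
        return r if r is not None else self.parent[x]

top_env_example = {'Backend': 'CPU', 'ScalarName': 'Float'}
option_example = {'ScalarName': 'Double'}          # option-level value wins
env = _nested_dict_sketch(option_example, top_env_example)
print(env['ScalarName'], env['Backend'])           # -> Double CPU
# ---------------------------------------------------------------------------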
+AtFormal = TypedDict('AtFormal', { + 'name': str, + 'type': str, + 'dynamic_type': str, + 'kwarg_only': bool, + 'is_nullable': bool, + 'default': str, + 'default_init': str, + 'python_default_init': str, + 'output': bool, + 'size': int, + 'is_type_dispatched': bool, +}, total=False) + +ReturnType = TypedDict('ReturnType', { + 'name': str, + 'type': str, + 'dynamic_type': str, +}, total=False) + +ReturnDecl = TypedDict('ReturnDecl', { + 'kind': str, + 'type': str, + 'arguments': List[int], +}, total=False) + +# Represents a buffer in nn.yaml +NNBuffer = TypedDict('NNBuffer', { + 'name': str, +}) + +FunctionOption = TypedDict('FunctionOption', { + 'actuals': List[str], + 'api_name': str, + 'arguments': List[THFormal], + 'aten_custom_call': str, + 'aten_dense_sparse': bool, + 'backend_type_pairs': List[Tuple[str, str]], + 'backends': List[str], + 'broadcast_actuals': List[str], + 'broadcast_function': str, + 'broadcast_modified_actuals': List[str], + 'broadcast_returns': List[str], + 'buffers': List[NNBuffer], + # cimpls is really a List[FunctionOption] + 'cimpls': List[Any], + 'cname': str, + 'condition': str, + 'const_mark': str, + 'device_guard': bool, + 'device_guard_declaration': str, + 'with_gil': bool, + 'cpu_half': bool, + 'deprecated': bool, + 'formals_list': List[AtFormal], + 'formals_with_defaults': List[str], + 'formals': List[str], + 'inferred_type': str, + 'inplace': bool, + 'method_actuals': List[str], + 'method_formals_with_defaults': List[str], + 'method_formals': List[str], + 'method_prefix_derived': str, + 'mode': str, + 'name': str, + 'native_actuals': List[str], + 'native_type_method_dispatch': str, + # options should be List[FunctionOption] + 'options': Any, + 'return_call': str, + 'return_type': str, + 'return': ReturnDecl, + 'returns': List[ReturnType], + 'scalar_check': str, + 'sparse': bool, + 'type_definition_body': List[str], + 'type_method_actuals': List[str], + 'type_method_definition_dispatch': str, + 'type_method_formals_with_defaults': List[str], + 'type_method_formals': List[str], + 'variants': str, + 'when_spares_dispatch': str, + 'when_sparse_dispatch': str, + 'with_gil': bool, + 'zero_dim_dispatch_when_scalar': str, + 'zero_dim_tensor_only': bool, +}) + +OutputDeclaration = NamedTuple('OutputDeclaration', [ + ('name', str), + ('method_prefix_derived', str), + ('arguments', List[AtFormal]), + ('method_of', List[str]), + ('mode', str), + ('buffers', Optional[List[str]]), + ('returns', List[ReturnType]), + ('inplace', bool), + ('abstract', bool), + ('device_guard', bool), + ('with_gil', bool), + ('deprecated', bool), +]) + + +def device_guard(option, formals, is_factory_method=False): + # For factory methods the `DeviceGuard` is already in the template. 
+ if option.get('device_guard', True) and not is_factory_method: + tensor_arguments = [f for f in formals if f['dynamic_type'] in {'Tensor', 'TensorList'}] + if tensor_arguments: + tensor_argument = tensor_arguments[0]['name'] + return 'const DeviceGuard device_guard({});'.format(tensor_argument) + return '// DeviceGuard omitted' + + +def is_real_argument_to_wrapper(argument): + # type: (THFormal) -> bool + return not argument.get('output', False) and\ + argument['type'] != 'CONSTANT' and\ + argument['type'] != 'argument' + + +def is_mutable_formal_argument(argument, option): + # type: (THFormal, FunctionOption) -> bool + return argument.get('output') or option['inplace'] and argument['name'] == 'self' + + +def to_return_type(arg, option): + # type: (THFormal, FunctionOption) -> ReturnType + t = arg['type'] + rt = TYPE_RETURN.get(t, t) + if rt == 'Tensor' and not arg.get('allocate'): + rt = rt + ' &' + if not is_mutable_formal_argument(arg, option): + rt = 'const ' + rt + return { + 'name': arg['name'], + 'type': rt, + 'dynamic_type': DYNAMIC_TYPE.get(arg['type'], arg['type']), + } + + +def create_generic(top_env, declarations): + # type: (TopEnvironment, List[FunctionOption]) -> List[OutputDeclaration] + # translates defaults from cwrap types to C++ values + def translate_default(argument, type_str, default): + # type: (THFormal, str, Any) -> Any + if default is None: + # cause the default constructor for the object to run + return '{}' + if 'if_true' in argument: + return argument['default'] == argument['if_true'] + for pattern, replacement in HEADER_CONSTANT_REPLACEMENTS: + default = re.sub(pattern, replacement, str(default)) + if type_str in {'Scalar', 'int64_t', 'double'}: + try: + return int(default) + except Exception: + try: + return float(default) + except Exception: + return default + elif type_str == 'bool': + assert default.lower() in ['true', 'false'] + return default.lower() == 'true' + else: + return default + + # change from THTensor* to Tensor & so we get how it will appear + # in the aten argument list... 
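# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the
# device_guard helper defined above resolves to one of two strings.  The
# condensed restatement below mirrors its logic so both outcomes can be shown
# for made-up formals; it is not the generator's real entry point.
def _device_guard_sketch(option, formals, is_factory_method=False):
    if option.get('device_guard', True) and not is_factory_method:
        tensors = [f for f in formals if f['dynamic_type'] in {'Tensor', 'TensorList'}]
        if tensors:
            return 'const DeviceGuard device_guard({});'.format(tensors[0]['name'])
    return '// DeviceGuard omitted'

sample_formals = [{'name': 'self', 'dynamic_type': 'Tensor'},
                  {'name': 'other', 'dynamic_type': 'Scalar'}]
print(_device_guard_sketch({}, sample_formals))
# -> const DeviceGuard device_guard(self);
print(_device_guard_sketch({'device_guard': False}, sample_formals))
# -> // DeviceGuard omitted
# ---------------------------------------------------------------------------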
+ def translate_formal(argument, option): + # type: (THFormal, FunctionOption) -> AtFormal + type_str = TYPE_FORMAL_GENERIC.get(argument['type'], argument['type']) + if type_str == 'Tensor &' and not is_mutable_formal_argument(argument, option): + type_str = 'const ' + type_str + translated = { + 'name': argument['name'], + 'type': type_str, + 'dynamic_type': DYNAMIC_TYPE.get(argument['type'], argument['type']), + } # type: AtFormal + if 'kwarg_only' in argument: + translated['kwarg_only'] = argument['kwarg_only'] + if 'default' in argument: + default = translate_default(argument, type_str, argument['default']) + translated['default'] = default + translated['default_init'] = argument.get('default_init', default) + if 'python_default_init' in argument: + assert 'default' not in argument + default = translate_default(argument, type_str, argument['python_default_init']) + translated['python_default_init'] = default + if argument.get('output'): + translated['output'] = True + if argument.get('size'): + translated['size'] = argument['size'] + if argument.get('is_nullable') is not None: + translated['is_nullable'] = argument['is_nullable'] + return translated + + def get_formals(option, include_constants=False): + # type: (FunctionOption, bool) -> List[AtFormal] + seen = set() # type: Set[str] + pos_args = [] # type: List[THFormal] + kwd_args = [] # type: List[THFormal] + + def insert(argument): + # type: (THFormal) -> None + if argument['name'] not in seen: + seen.add(argument['name']) + if argument.get('kwarg_only', False): + kwd_args.append(argument) + else: + pos_args.append(argument) + + def has_output_mask(argument): + # type: (THFormal) -> bool + return argument.get('allocate', False) and argument.get('mask', False) + + for argument in option['arguments']: + if argument.get('output') and not argument.get('allocate', False): + insert(argument) + for argument in option['arguments']: + if argument['type'] == 'THSTensor*': + # only enable for a subset of Dense/Sparse ops + if not (option.get('aten_dense_sparse', False)): + raise NYIError("Sparse Tensor") + + if include_constants and argument['type'] == 'CONSTANT': + insert(argument) + elif is_real_argument_to_wrapper(argument): + insert(argument) + if any(has_output_mask(arg) for arg in option['arguments']): + mask_size = sum(has_output_mask(arg) for arg in option['arguments']) + insert({ + 'name': 'output_mask', + # NB: Lack of space in comma works around parsing + # problem in gen_variable_type.py + 'type': 'std::array'.format(mask_size), + 'default': '{{' + ', '.join(['true'] * mask_size) + '}}', + }) + + result = pos_args + kwd_args + return [translate_formal(argument, option) for argument in result] + + def get_return_types(option): + # type: (FunctionOption) -> List[ReturnType] + ret = option['return'] + if ret['kind'] == 'arguments': + argument_indices = ret['arguments'] + if len(argument_indices) == 1: + the_arg = option['arguments'][argument_indices[0]] + return [to_return_type(the_arg, option)] + else: + return [to_return_type(option['arguments'][idx], option) + for idx in argument_indices] + elif ret['kind'] == 'type': + return [{ + 'type': TYPE_RETURN.get(ret['type'], ret['type']), + 'dynamic_type': DYNAMIC_TYPE.get(ret['type'], ret['type']), + }] + else: + raise Exception("format_return_type") + + def format_return_type(return_types): + # type: (List[ReturnType]) -> str + if len(return_types) == 1: + return return_types[0]['type'] + return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) + + def 
find_dispatch_tensor(formals): + # type: (List[AtFormal]) -> Optional[str] + # dispatch to self if it's a parameter + for formal in formals: + if formal['name'] == 'self' and formal['dynamic_type'] == 'Tensor': + return formal['name'] + # otherwise dispatch to the first Tensor or TensorList + for formal in formals: + if 'TensorList' == formal['dynamic_type'] or formal['dynamic_type'] == 'Tensor': + return formal['name'] + return None + + def format_formal(f): + # type: (AtFormal) -> str + return '{} {}'.format(f['type'], f['name']) + + def formal_with_default(f): + # type: (AtFormal) -> str + s = format_formal(f) + v = f.get('default') + if v is None: + return s + if isinstance(v, bool): + v = str(v).lower() + return '{}={}'.format(s, v) + + def get_broadcast_argument(option): + # type: (FunctionOption) -> Optional[THFormal] + for argument in option['arguments']: + if argument.get('broadcast'): + return argument + return None + + def get_broadcast_actuals(broadcast_arg, broadcast_inplace, broadcast_dims): + # type: (THFormal, bool, bool) -> List[str] + # Note: broadcast_dims can change type... + # return the actuals that will be passed to the broadcast function. + # 1) in the common case, this is the broadcasted argument (e.g. "self") followed by the tensors + # that it is broadcasted against (comma-separated) (e.g. "self, tensor1, tensor2"). + # 2) in the broadcast_dims case, this is the broadcasted argument (e.g. "self") followed by the sizes + # it is broadcasted to (as an initializer list), so e.g. the specification + # "mat1.dim0,mat2.dim1" gets transformed to "self, {mat1.size(0),mat2.size(1)}" + if not broadcast_dims: + broadcast_actuals = [broadcast_arg['name']] + broadcast_arg['broadcast'].split()[0].split(",") + else: + broadcast_dims_spec = broadcast_arg['broadcast'].split()[1].split(':')[1].split(',') + # generate size call for each dimension + broadcast_dims = ([x.split('.')[0] + '.size(' + x.split('.')[1].replace('dim', '') + ')' # type: ignore + for x in broadcast_dims_spec]) + broadcast_dims_init_list = '{' + ','.join(broadcast_dims) + '}' # type: ignore + broadcast_actuals = [broadcast_arg['name'], broadcast_dims_init_list] + + return broadcast_actuals + + def emit_nn_body(option): + # type: (FunctionOption) -> Union[str, List[str]] + # Concrete definition on Type.cpp for NN functions. Delegates to the + # xxx_forward variant variant after creating any necessary buffers. 
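# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): the two
# branches of get_broadcast_actuals above are easier to picture with concrete
# inputs.  The condensed restatement below reproduces just that string
# handling; the broadcast specs fed to it are made-up examples shaped like
# the ones the comment describes (a plain tensor list vs. a "dims:" spec).
def _broadcast_actuals_sketch(arg_name, broadcast_spec):
    parts = broadcast_spec.split()
    if len(parts) < 2 or not parts[1].startswith('dims:'):
        # common case: broadcast against the listed tensors
        return [arg_name] + parts[0].split(',')
    # dims case: broadcast to an initializer list of sizes
    dims = ['{}.size({})'.format(x.split('.')[0], x.split('.')[1].replace('dim', ''))
            for x in parts[1].split(':')[1].split(',')]
    return [arg_name, '{' + ','.join(dims) + '}']

print(_broadcast_actuals_sketch('self', 'tensor1,tensor2 fallback'))
# -> ['self', 'tensor1', 'tensor2']
print(_broadcast_actuals_sketch('self', 'mat1,mat2 dims:mat1.dim0,mat2.dim1'))
# -> ['self', '{mat1.size(0),mat2.size(1)}']
# ---------------------------------------------------------------------------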
+ actuals = option['actuals'] + base_name = option['name'][:-1] if option['inplace'] else option['name'] + fwd_name = option['api_name'].replace(base_name, base_name + '_forward') + + if len(option['buffers']) == 0: + return 'return {}({});'.format(fwd_name, ', '.join(actuals)) + + body = [] # type: List[str] + if option['api_name'].endswith('_out'): + # _out variants must create buffers and insert them in the + # arguments list between output and input arguments + for buffer in option['buffers']: + body.append('Tensor {} = tensor();'.format(buffer['name'])) + actuals = [arg['name'] for arg in option['arguments'] if arg.get('output')] + actuals += [buffer['name'] for buffer in option['buffers']] + actuals += [arg['name'] for arg in option['arguments'] if not arg.get('output')] + + body.append('return std::get<0>({}({}));'.format(fwd_name, ', '.join(actuals))) + return body + + def process_option(option, output_options): + # type: (FunctionOption, List[OutputDeclaration]) -> None + option['inplace'] = re.search( + '(^__i|[^_]_$)', option['api_name']) is not None + + # print(yaml.dump(option)) + formals = get_formals(option) + option['formals_list'] = formals + option['formals'] = [format_formal(f) for f in formals] + option['formals_with_defaults'] = [formal_with_default(f) for f in formals] + option['returns'] = get_return_types(option) + option['return_type'] = format_return_type(option['returns']) + option['return_call'] = 'return ' if option['return_type'] != 'void' else '' + option['actuals'] = [f['name'] for f in formals] + + option['method_formals'] = [format_formal(f) for f in formals + if f['name'] != 'self'] + option['method_formals_with_defaults'] = ( + [formal_with_default(f) for f in formals if f['name'] != 'self']) + option['method_actuals'] = [ + f['name'] if f['name'] != 'self' else '*this' for f in formals] + + # There are no cases where these differ, but they do in native_functions + option['type_method_formals'] = option['formals'] + option['type_method_formals_with_defaults'] = option['formals_with_defaults'] + option['type_method_actuals'] = option['actuals'] + + option['const_mark'] = '' if option['inplace'] else ' const' + + is_method = 'method' in option['variants'] + is_function = 'function' in option['variants'] + dispatch_tensor = find_dispatch_tensor(formals) + is_namespace_function = is_function and dispatch_tensor is not None + + broadcast_arg = get_broadcast_argument(option) + # "s_" for "same size". + option['method_prefix_derived'] = '' if broadcast_arg is None else 's_' + option['device_guard_declaration'] = device_guard(option, formals) + + env = nested_dict(option, top_env) + + mode = option['mode'] + abstract = True + if mode == 'NN' and option.get('cimpls') is None: + # NN function with no _forward/_backward suffix don't have cimpls. 
+ # They call the _forward function and discard any buffer returns + abstract = False + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + body = emit_nn_body(option) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_CONCRETE.substitute( + env, type_definition_body=body)) + elif broadcast_arg is None: + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + else: + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + top_env['type_method_declarations'].append( + TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + + broadcast_inplace = 'inplace' in broadcast_arg['broadcast'] + broadcast_dims = 'dims:' in broadcast_arg['broadcast'] + option['broadcast_actuals'] = get_broadcast_actuals(broadcast_arg, broadcast_inplace, broadcast_dims) + if not broadcast_dims: + option['broadcast_returns'] = (["b_" + x for x in option['broadcast_actuals'] + if x != broadcast_arg['name'] or not broadcast_inplace]) + else: + option['broadcast_returns'] = ["b_" + broadcast_arg['name']] + + option['broadcast_function'] = 'expand_' + ('inplace' if broadcast_inplace + else 'size' if broadcast_dims else 'outplace') + option['broadcast_modified_actuals'] = ['b_' + y if 'b_' + y in option['broadcast_returns'] else y + for y in option['actuals']] + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_BROADCAST.substitute(env)) + + method_of = ['Type'] + if is_method: + top_env['tensor_method_declarations'].append( + TENSOR_METHOD_DECLARATION.substitute(env)) + top_env['tensor_method_definitions'].append( + TENSOR_METHOD_DEFINITION.substitute(env)) + method_of.append('Tensor') + + if is_namespace_function: + option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + top_env['function_declarations'].append( + FUNCTION_DECLARATION.substitute(env)) + top_env['function_definitions'].append( + FUNCTION_DEFINITION.substitute(env)) + method_of.append('namespace') + + buffer_names = [buffer['name'] for buffer in option.get('buffers', [])] + + output_options.append(OutputDeclaration( + name=option['api_name'], + method_prefix_derived=option['method_prefix_derived'], + arguments=formals, + method_of=method_of, + mode=mode, + buffers=buffer_names, + returns=option['returns'], + inplace=option['inplace'], + # See Note [Abstract ATen methods] + abstract=abstract, + device_guard=option.get('device_guard', True), + with_gil=option.get('with_gil', False), + deprecated=option.get('deprecated', False) + )) + + def native_get_formals(option, include_constants=False): + # type: (FunctionOption, bool) -> List[AtFormal] + seen = set() # type: Set[str] + pos_args = [] + kwd_args = [] + + def insert(argument): + # type: (AtFormal) -> None + if argument['name'] not in seen: + seen.add(argument['name']) + if argument.get('kwarg_only', False): + kwd_args.append(argument) + else: + pos_args.append(argument) + + for argument in option['arguments']: + insert(argument) + + # not clear we need dynamic_type translation as we can specify the correct type + # directly in native functions + def add_dynamic_type(argument, option): + # type: (AtFormal, FunctionOption) -> AtFormal + argument['dynamic_type'] = NATIVE_DYNAMIC_TYPE.get(argument['type'], argument['type']) + return argument + + 
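# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): both
# process_option and process_native decide whether an option is in-place with
# re.search('(^__i|[^_]_$)', api_name).  The quick check below shows which
# made-up api_names that pattern accepts.
import re

for name in ['add', 'add_', 'add_out', '__iadd__']:
    print(name, bool(re.search('(^__i|[^_]_$)', name)))
# -> add False, add_ True, add_out False, __iadd__ True
# ---------------------------------------------------------------------------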
result = pos_args + kwd_args + result = [add_dynamic_type(argument, option) for argument in result] + + # ensure we get reference-type formals when appropriate + def native_translate_formals(argument, option): + # type: (AtFormal, FunctionOption) -> AtFormal + def translate_map(const): + # type: (bool) -> Dict[str, str] + return { + 'Tensor': 'const Tensor &' if const else 'Tensor &', + 'BoolTensor': 'const Tensor &' if const else 'Tensor &', + 'IndexTensor': 'const Tensor &' if const else 'Tensor &', + 'Type': 'const Type &' if const else 'Type &', + 'TensorOptions': 'const TensorOptions &' if const else 'TensorOptions &', + } + + if (option['inplace'] and argument['name'] == 'self') or argument.get('output', False): + argument['type'] = translate_map(False).get(argument['type'], argument['type']) + else: + argument['type'] = translate_map(True).get(argument['type'], argument['type']) + + return argument + + result = [native_translate_formals(argument, option) for argument in result] + return result + + # this can return multiple return types in a list, e.g. ['Tensor', 'Tensor'] + def native_get_return_types(option): + # type: (FunctionOption) -> List[ReturnType] + ret = option['return'] + + return_types = [] # List[ReturnType] + for t_raw in ret: + if isinstance(t_raw, string_type): + t = t_raw + name = None + elif t_raw is None: + t = 'void' + name = None + else: + t = t_raw['type'] + name = t_raw['name'] + + # can't actually return a TensorList (since it's a reference object) + actual_return_type = {'TensorList': 'std::vector'}.get(t, t) + + if actual_return_type == 'Tensor' and (option['inplace'] or option['api_name'].endswith('_out')): + # follow normal ATen convention of returning Tensor & for inplace functions. + actual_return_type = 'Tensor &' + + rtype = { + 'type': actual_return_type, + 'dynamic_type': NATIVE_DYNAMIC_TYPE.get(t, t), + } # type: ReturnType + if name is not None: + rtype['name'] = name + return_types.append(rtype) + + return return_types + + def process_native(option, output_options): + # type: (FunctionOption, List[OutputDeclaration]) -> None + option['inplace'] = re.search( + '(^__i|[^_]_$)', option['api_name']) is not None + + formals = native_get_formals(option) + option['formals_list'] = formals + option['formals'] = [format_formal(f) for f in formals] + option['formals_with_defaults'] = [formal_with_default(f) for f in formals] + option['returns'] = native_get_return_types(option) + option['return_type'] = format_return_type(option['returns']) + option['return_call'] = 'return ' if option['return_type'] != 'void' else '' + option['actuals'] = [f['name'] for f in formals] + + option['method_formals'] = [format_formal(f) for f in formals + if f['name'] != 'self'] + option['method_formals_with_defaults'] = ( + [formal_with_default(f) for f in formals if f['name'] != 'self']) + option['method_actuals'] = [ + f['name'] if f['name'] != 'self' else '*this' for f in formals] + + def find_formal(formal_name, formals): + for formal in formals: + if formal_name == formal['dynamic_type']: + return formal + return None + + dispatch_tensor = find_dispatch_tensor(formals) + dispatch_type = None if dispatch_tensor else find_formal('Type', formals) + if dispatch_type: + dispatch_type['is_type_dispatched'] = True + + option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] + option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] + option['type_method_actuals'] = [f['name'] for f in formals 
if f != dispatch_type] + option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] + + option['const_mark'] = '' if option['inplace'] else ' const' + + is_method = 'method' in option['variants'] + is_namespace_function = 'function' in option['variants'] + is_factory_method = find_formal('TensorOptions', formals) + is_deprecated_factory_method = len(formals) > 0 and \ + formals[0]['dynamic_type'] == 'Type' and \ + option['return_type'] == 'Tensor' and option['deprecated'] + needs_native_definition = not is_deprecated_factory_method + + has_dispatch = dispatch_tensor or dispatch_type + + option['method_prefix_derived'] = '' + option['device_guard_declaration'] = device_guard(option, formals, is_factory_method) + + env = nested_dict(option, top_env) + + broadcast_arg = get_broadcast_argument(option) + if broadcast_arg is not None: + raise Exception("broadcasting is not yet supported for native functions, " + "but specified for function {}", option['name']) + + # Factory methods are not dispatched over `Type`. + if not is_factory_method: + if option['deprecated']: + top_env['type_method_declarations'].append(DEPRECATED_TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + else: + top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) + dispatch = option['type_method_definition_dispatch'] + option['native_type_method_dispatch'] = dispatch + + # Note [Abstract ATen methods] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # An abstract ATen method is one whose dispatch differs between + # types. These are implemented in derived types (with a + # standard (throwing) definition in Type). A concrete ATen + # method is one which has the same dispatch for all types; + # we just implement it in the base Type. This is exposed + # in Declarations.yaml via a field named 'abstract'. + abstract = False + if isinstance(dispatch, dict): + abstract = True + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) + elif is_deprecated_factory_method: + top_env['type_method_definitions'].append( + DEPRECATED_TYPE_METHOD_DEFINITION_CONCRETE.substitute(env)) + elif not is_factory_method: + body = TYPE_DEFINITION_BODY_NATIVE.substitute(env) + top_env['type_method_definitions'].append( + TYPE_METHOD_DEFINITION_CONCRETE.substitute( + env, type_definition_body=body)) + + # generate the at::native function declarations (i.e. 
what the user will implement) + if needs_native_definition: + if isinstance(dispatch, dict): + generated_native_functions = [] # type: List[str] + for key in sorted(dispatch.keys()): + value = dispatch[key] + if value not in generated_native_functions: + option['native_type_method_dispatch'] = value + top_env['native_function_declarations'].append( + NATIVE_DECLARATION.substitute(env)) + generated_native_functions.append(value) + else: + top_env['native_function_declarations'].append( + NATIVE_DECLARATION.substitute(env)) + + method_of = ['Type'] + if is_method: + top_env['tensor_method_declarations'].append( + TENSOR_METHOD_DECLARATION.substitute(env)) + top_env['tensor_method_definitions'].append( + TENSOR_METHOD_DEFINITION.substitute(env)) + method_of.append('Tensor') + + if is_namespace_function: + if dispatch_type: + option['inferred_type'] = dispatch_type['name'] + elif dispatch_tensor: + option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + else: + # doesn't depend on a specific type, use undefined float + option['inferred_type'] = 'at::getType(at::Backend::Undefined, at::ScalarType::Float)' + declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION + top_env['function_declarations'].append(declaration.substitute(env)) + if is_factory_method: + top_env['function_definitions'].append(FACTORY_DEFINITION.substitute(env)) + elif is_deprecated_factory_method: + top_env['function_definitions'].append(DEPRECATED_FACTORY_DEFINITION.substitute(env)) + else: + top_env['function_definitions'].append(FUNCTION_DEFINITION.substitute(env)) + method_of.append('namespace') + + output_options.append(OutputDeclaration( + name=option['api_name'], + method_prefix_derived=option['method_prefix_derived'], + arguments=formals, + method_of=method_of, + mode=option['mode'], + buffers=None, + returns=option['returns'], + inplace=option['inplace'], + # See Note [Abstract ATen methods] + abstract=abstract, + device_guard=option.get('device_guard', True), + with_gil=option.get('with_gil', False), + deprecated=option['deprecated'], + )) + + output_declarations = [] # type: List[OutputDeclaration] + for declaration in declarations: + output_options = [] # type: List[OutputDeclaration] + for option in declaration['options']: + try: + if option['mode'] != 'native': + process_option(option, output_options) + else: + process_native(option, output_options) + except NYIError: + option['skip'] = True + output_declarations.extend(output_options) + return output_declarations + + +def create_derived(backend_type_env, declarations): + # type: (Environment, List[FunctionOption]) -> Tuple[List[str], List[str]] + type_object_declarations = [] + type_object_definitions = [] + + is_cuda = 'CUDA' in backend_type_env['Backend'] + + real_is_half = backend_type_env['ScalarName'] == 'Half' + + def replace_with_null(argument): + # type: (THFormal) -> bool + return (argument['type'] == 'THGenerator*' and + backend_type_env['Backend'] == 'CUDA') + + def requires_checked_cast(argument): + # type: (THFormal) -> bool + if argument['type'] == 'IntList': + return 'size' in argument + return argument['type'] in CHECKED_CAST + + def nullable_argument(argument): + # type: (THFormal) -> bool + return argument.get('is_nullable', False) + + def bool_option_is_string(argument): + # type: (THFormal) -> bool + return 'if_true' in argument and isinstance(argument['if_true'], string_type) + + def get_argument(argument, option): + # type: (THFormal, FunctionOption) -> str + if 
replace_with_null(argument): + return 'NULL' + elif requires_checked_cast(argument): + checked_use = CHECKED_USE.get( + argument['type'], '{}_').format(argument['name']) + if real_is_half and argument['type'] == 'real': + checked_use = HALF_CONVERSION.substitute(value=checked_use) + if nullable_argument(argument): + checked_use = CHECKED_USE_NULLABLE.substitute( + env={}, arg_name=argument['name'], usage=checked_use) + return checked_use + elif argument['type'] == 'bool' and 'if_true' in argument: + if bool_option_is_string(argument): + tpl = '({}) ? "{}" : "{}"' + else: + tpl = '({}) ? {} : {}' + return tpl.format(argument['name'], + argument['if_true'], argument['if_false']) + elif argument['type'] == 'CONSTANT': + # this is a bool that is actually a string... + if bool_option_is_string(argument): + return '"{}"'.format(argument['name']) + v = str(argument.get('default', argument['name'])) + for pattern, replacement in CONSTANT_REPLACEMENTS: + v = re.sub(pattern, replacement, v) + return CodeTemplate(v).substitute(backend_type_env) + # e.g. argument 0, i.e. repeat the 0th argument in this position... + elif argument['type'] == 'argument': + index = int(argument['name']) + return get_argument(option['arguments'][index], option) + else: + return argument['name'] + + def drop_argument(argument, option): + # type: (THFormal, FunctionOption) -> bool + # Devices are handled in the body of the function. + if argument['name'] == 'device': + return True + return 'CUDA' in backend_type_env['Backend'] and ( + option['mode'] == 'TH' and argument['type'] == 'THGenerator*') + + def get_arguments(arguments, option): + # type: (List[THFormal], FunctionOption) -> List[str] + return [get_argument(argument, option) + for argument in arguments if not drop_argument(argument, option)] + + def is_actual_return_long(ret): + # type: (ReturnDecl) -> bool + if ret['type'] == 'long': + return True + if ret['type'] == 'real': + return backend_type_env['ScalarName'] == 'Long' + if ret['type'] == 'accreal': + return backend_type_env['AccScalarName'] == 'Long' + return False + + def handle_zero_dim(env, option): + # type: (Environment, FunctionOption) -> List[str] + zero_dim_dispatch = option.get('zero_dim_dispatch_when_scalar', '') + if not zero_dim_dispatch: + return [] + broadcasts_arg = zero_dim_dispatch in option.get('broadcast_actuals', '') + zero_dim_only = option.get('zero_dim_tensor_only', False) + # this combination doesn't seem to make sense + assert not (broadcasts_arg and zero_dim_only) + # if the argument broadcasts, then this would only affect cases where all broadcasted + # tensors were zero-dim, which is inconsistent with the scalar handling. 
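# ---------------------------------------------------------------------------
# Editor's note (illustrative aside, not part of the original file): in
# get_argument above, a bool formal carrying if_true/if_false values is
# lowered to a C-style ternary, with quoting when the two values are strings.
# The snippet below replays that formatting for two made-up arguments.
def _bool_argument_sketch(arg):
    tpl = '({}) ? "{}" : "{}"' if isinstance(arg['if_true'], str) else '({}) ? {} : {}'
    return tpl.format(arg['name'], arg['if_true'], arg['if_false'])

print(_bool_argument_sketch({'name': 'keepdim', 'if_true': 1, 'if_false': 0}))
# -> (keepdim) ? 1 : 0
print(_bool_argument_sketch({'name': 'largest', 'if_true': 'largest', 'if_false': 'smallest'}))
# -> (largest) ? "largest" : "smallest"
# ---------------------------------------------------------------------------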
+ if broadcasts_arg: + return [] + zero_dim_actuals = [arg['name'] + if arg['name'] != zero_dim_dispatch else "Scalar({})".format(arg['name']) + for arg in option['formals_list']] + return [ZERO_DIM_CHECK.substitute(env, check_name=zero_dim_dispatch, zero_dim_actuals=zero_dim_actuals)] + + def handle_only_zero_dim(env, option): + # type: (Environment, FunctionOption) -> Optional[List[str]] + if option.get('zero_dim_tensor_only', False): + check_name = option['zero_dim_dispatch_when_scalar'] + return [ZERO_DIM_ONLY.substitute(env, check_name=check_name)] + else: + return None + + def handle_sparse(env, option): + # type: (Environment, FunctionOption) -> List[str] + if 'when_sparse_dispatch' not in option or 'Sparse' in backend_type_env['Backend']: + return [] + check_name = option['when_sparse_dispatch'] + sparse_actuals = [arg['name'] + if arg['name'] != check_name else "SparseTensorRef({})".format(arg['name']) + for arg in option['formals_list']] + return [SPARSE_CHECK.substitute(env, check_name=check_name, sparse_actuals=sparse_actuals)] + + def allocate_arg(env, arg, output_count): + # type: (Environment, THFormal, int) -> List[str] + name = arg['name'] + allocation = CodeTemplate(ALLOC_WRAP[arg['type']]).substitute(env, arguments=[]) + tensor_arg = '{}_'.format(name) + if arg.get('mask', False): + allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) + tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + .format(name, name)) + return [ + 'auto {}_ = {};'.format(name, allocation), + 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + ] + + def resize_arg(arg): + # type: (THFormal) -> str + resize = arg['resize'] + if isinstance(resize, str): + return "{}.resize_({}.sizes());".format(arg['name'], resize) + else: + resize_scalar = arg.get('resize_scalar', False) + if resize_scalar: + dims = ['{}.dim() == 0 ? 1 : {}.size({})'.format(name, name, dim) for name, dim in resize] + else: + dims = ['{}.size({})'.format(name, dim) for name, dim in resize] + return "{}.resize_({{ {} }});".format(arg['name'], ','.join(dims)) + + def handle_call(env, option, cimpl): + # type: (Environment, FunctionOption, FunctionOption) -> str + is_nn = option['mode'] == 'NN' + actuals = get_arguments(cimpl['arguments'], option) + if is_cuda or is_nn: + actuals = ['context->getTHCState()'] + actuals + + cname = cimpl['cname'] + if option.get('sparse', False): + if is_cuda: + cname = 'THCS' + env['ScalarName'] + "Tensor_" + cname + else: + cname = env['THTensor'].replace('TH', 'THS') + '_' + cname + elif is_nn: + cname = 'THNN_{}'.format(env['THType']) + cname + else: + cname = env['THTensor'] + '_' + cname + + call = CALL_TEMPLATE.substitute(actuals=actuals, cname=cname) + if cimpl.get('condition') is not None: + call = 'if ({}) {}'.format(cimpl['condition'], call) + return call + + def emit_body(env, option): + # type: (Environment, FunctionOption) -> List[str] + body = [] # type: List[str] + body += handle_sparse(env, option) + body += handle_zero_dim(env, option) + only_zero_dim_check = handle_only_zero_dim(env, option) + if only_zero_dim_check is not None: + # code below only_zero_dim_check is unreachable so we do not need to generate the rest. 
+ body += only_zero_dim_check + return body + + # arguments are potentially duplicated because of one argument + # referencing another + seen_names = set() # type: Set[str] + seen_tensorlists = set() # type: Set[str] + count = 0 + output_count = 0 + + # scalar_check is the heuristic conditions when a result may be a scalar_check + # if there is a THSize* argument, then its dimensions are used to determine scalar. + # otherwise, it is true if all the input tensors are scalars, + scalar_check_is_from_size = False + scalar_check_is_from_option = False + scalar_check = None + scalar_check_opt = option.get('scalar_check') + if scalar_check_opt is not None: + if isinstance(scalar_check_opt, bool): + scalar_check = str(scalar_check_opt).lower() + else: + scalar_check = scalar_check_opt + scalar_check_is_from_option = True + + for arg in option['arguments']: + if is_real_argument_to_wrapper(arg): + count += 1 + if arg['type'] == 'THSize*' and not scalar_check_is_from_option: + scalar_check_is_from_size = True + scalar_check = '{}.size() == 0'.format(arg['name']) + if arg['type'] == 'TensorList': + seen_tensorlists.add(arg['name']) + + wrap_dim_target = arg.get('wrap_dim', None) + if wrap_dim_target is not None: + # for Tensors, "name_" is the TensorImpl, but for TensorLists, it is an + # std::vector of TH*s. Since TH*s have different dimension rules, we used + # "name" instead, but keep "name_" for tensor to avoid an extra function call. + if wrap_dim_target not in seen_tensorlists: + wrap_dim_target = wrap_dim_target + "_" + body.append("{} = maybe_wrap_dim({}, {});" + .format(arg['name'], arg['name'], wrap_dim_target)) + + # only generated checked casts the first time we see it + if arg['name'] not in seen_names and requires_checked_cast(arg): + seen_names.add(arg['name']) + + # make a new allocation of TensorImpl, then wrap a Tensor around it. + if arg.get('allocate', False): + body += allocate_arg(env, arg, output_count) + output_count += 1 + # extract the TensorImpl from an existing tensor (or Storage, etc.) + else: + # special case where we allow undefined Tensors, and thus + # the checked cast succeeds even if the Tensor is not + # defined + null_okay = 'true' if nullable_argument(arg) else 'false' + default_init = [] + if 'default_init' in arg: + default_init.append(arg['default_init']) + + if arg['type'] in DIRECT_CONSTRUCTION_CHECKED_CAST: + body.append(CHECKED_CAST[arg['type']].substitute( + env, arg_name=arg['name'], arg_pos=count, + null_okay=null_okay, default_init=default_init, + size=arg.get('size'), + result_name=arg['name'] + '_')) + else: + check_cast = CHECKED_CAST[arg['type']].substitute( + env, arg_name=arg['name'], arg_pos=count, + null_okay=null_okay, default_init=default_init, + size=arg.get('size')) + body.append("auto {}_ = {};".format( + arg['name'], check_cast)) + if drop_argument(arg, option) or replace_with_null(arg): + body.append( + "(void) {}_; //silence unused warning".format(arg['name'])) + + initializers = [] + + # resize tensors for special ops that require it + if 'resize' in arg: + initializers.append(resize_arg(arg)) + + # also special handling where we zero some outputs. 
+ if arg.get('zero', False) or (arg.get('cpu_zero', False) and not is_cuda): + initializers.append("{}.zero_();".format(arg['name'])) + + # only initialize non-null arguments + if nullable_argument(arg) and len(initializers) > 0: + body.append(CONDITIONAL_INITIALIZER.substitute({ + 'name': arg['name'], + 'initializer': initializers + })) + else: + body += initializers + + # for out-of-place: isScalar() for all input tensors is and'd to form + # the test for whether the output is also a scalar + # for in-place: isScalar() shouldn't change as a result of the operation + if (not arg.get('output') and 'Tensor' in arg['type'] and + 'TensorList' not in arg['type'] and + 'THS' not in arg['type'] and + not scalar_check_is_from_size and + not scalar_check_is_from_option and + not option['inplace']): + check = '{}->isScalar()'.format(arg['name'] + '_') + if nullable_argument(arg): + check = '(!{} || {})'.format(arg['name'] + '_', check) + scalar_check = (check if scalar_check is None + else scalar_check + ' && ' + check) + + # cimpls, if it exists, contains the underlying C function names and + # arguments. Otherwise use option + cimpls = option.get('cimpls', [option]) + calls = [handle_call(env, option, cimpl) for cimpl in cimpls] + + ret = option['return'] + + if ret['kind'] == 'arguments': + if 'aten_custom_call' in option: + # all aten_custom_call bodies handle settings on their own. + scalar_check = None + body.append(CodeTemplate( + option['aten_custom_call']).substitute(env)) + else: + body.extend([call + ';' for call in calls]) + arguments_indices = ret['arguments'] + arguments = [option['arguments'][argi] + for argi in arguments_indices] + if scalar_check is not None: + if not isinstance(scalar_check, dict): + if len(arguments) > 1: + body.append("bool maybe_scalar = {};".format(scalar_check)) + scalar_check = 'maybe_scalar' + for arg in arguments: + scalar_check_arg = (scalar_check if not isinstance(scalar_check, dict) + else scalar_check.get(arg['name'])) # type: ignore + if scalar_check_arg is not None: + stmt = "{}_->maybeScalar({});".format(arg['name'], scalar_check_arg) + if nullable_argument(arg): + stmt = "if ({}_) {}".format(arg['name'], stmt) + body.append(stmt) + if len(arguments_indices) == 1: + arg = arguments[0] + body.append("return {};".format(arg['name'])) + else: + types = [to_return_type(arg, option)['type'] + for arg in arguments] + # TODO: check for move semantics... + names = [arg['name'] for arg in arguments] + body.append(CodeTemplate("return std::tuple<${types}>(${names});").substitute( + types=types, names=names)) + elif ret['kind'] == 'type': + assert len(calls) == 1 + call = calls[0] + if 'aten_custom_call' in option: + # all aten_custom_call bodies handle settings on their own. + scalar_check = None + body.append(CodeTemplate( + option['aten_custom_call']).substitute(env)) + + if ret['type'] in ALLOC_WRAP.keys(): + maybe_scalar = "->maybeScalar({})".format(scalar_check) \ + if scalar_check is not None \ + else "" + wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( + env, arguments=[call]) + return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + body.append(CodeTemplate(return_tensor).substitute( + env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) + # return the same underlying Tensor type for both real and accreal; this ensures + # e.g. x.sum(0) and x.sum() return the same type. We explicitly cast to the + # ScalarType before constructing the scalarTensor to avoid overflow checking. 
+ elif ret['type'] == 'accreal' or ret['type'] == 'real': + return_scalar = 'return scalarTensor(convert<${ScalarType}>(${call}));' + body.append(CodeTemplate(return_scalar).substitute(env, call=call)) + else: + # we using int64_t for long in the API, so correct it here... + if is_actual_return_long(ret): + call = "static_cast({})".format(call) + body.append("return {};".format(call)) + else: + raise Exception("NYI - return handling") + return body + + def process_option(option): + # type: (FunctionOption) -> None + pair = (backend_type_env['Backend'], + backend_type_env['ScalarName']) + if pair in option['backend_type_pairs']: + env = nested_dict(option, backend_type_env) + body = emit_body(env, option) # type: ignore + option['type_definition_body'] = body + type_object_declarations.append( + TYPE_DERIVED_DECLARATION.substitute(env)) + type_object_definitions.append( + TYPE_DERIVED_DEFINITION.substitute(env)) + + def process_native(option): + # type: (FunctionOption) -> None + dispatch = option['type_method_definition_dispatch'] + env = nested_dict(option, backend_type_env) + + if isinstance(dispatch, dict): + pair = (backend_type_env['Backend'], + backend_type_env['ScalarName']) + if pair in option['backend_type_pairs']: + native_dispatch = dispatch.get(pair[0]) + type_object_declarations.append( + TYPE_DERIVED_DECLARATION.substitute(env)) + if native_dispatch is None: + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) + else: + option['native_type_method_dispatch'] = native_dispatch + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE.substitute(env)) + + for declaration in declarations: + for option in declaration['options']: + if not option.get('skip', False): + try: + if option['mode'] == 'NN' and option.get('cimpls') is None: + continue + if option['mode'] != 'native': + process_option(option) + else: + process_native(option) + except NYIError: + pass + return type_object_declarations, type_object_definitions diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py new file mode 100644 index 0000000..6d3598a --- /dev/null +++ b/aten/src/ATen/gen.py @@ -0,0 +1,457 @@ +import argparse +import os + +import yaml +from collections import OrderedDict + +import sys +from os import path +sys.path.append(path.dirname(path.abspath(__file__))) + +import cwrap_parser +import nn_parse +import native_parse +import preprocess_declarations +import function_wrapper +import copy_wrapper + +from code_template import CodeTemplate + + +# This file is the top-level entry point for code generation in ATen. +# It takes an arbitrary number of arguments specifying metadata files to +# process (.cwrap, .yaml and .h) and outputs a number generated header +# and cpp files in ATen/ (see invocations of 'write' for each file that +# is written.) It is invoked from cmake; look for the 'cwrap_files' +# variable for an up-to-date list of files which are passed. 
+ +parser = argparse.ArgumentParser(description='Generate ATen source files') +parser.add_argument('files', help='cwrap files', nargs='+') + +parser.add_argument( + '-s', + '--source-path', + help='path to source directory for ATen', + default='.') +parser.add_argument( + '-o', + '--output-dependencies', + help='output a list of dependencies into the given file and exit') +parser.add_argument( + '-d', '--install_dir', help='output directory', default='ATen') +options = parser.parse_args() + +if options.install_dir is not None and not os.path.exists(options.install_dir): + os.makedirs(options.install_dir) + + +class FileManager(object): + def __init__(self): + self.filenames = set() + self.outputs_written = False + self.undeclared_files = [] + + def will_write(self, filename): + filename = '{}/{}'.format(options.install_dir, filename) + if self.outputs_written: + raise Exception("'will_write' can only be called before " + + "the call to write_outputs, refactor so outputs are registered " + + "before running the generators") + self.filenames.add(filename) + + def _write_if_changed(self, filename, contents): + try: + with open(filename, 'r') as f: + old_contents = f.read() + except IOError: + old_contents = None + if contents != old_contents: + with open(filename, 'w') as f: + f.write(contents) + + def write_outputs(self, filename): + """Write a file containing the list of all outputs which are + generated by this script.""" + self._write_if_changed( + filename, + ''.join(name + ";" for name in sorted(self.filenames))) + self.outputs_written = True + + def write(self, filename, s, env=None): + filename = '{}/{}'.format(options.install_dir, filename) + if isinstance(s, CodeTemplate): + assert env is not None + env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" + s = s.substitute(env) + self._write_if_changed(filename, s) + if filename not in self.filenames: + self.undeclared_files.append(filename) + else: + self.filenames.remove(filename) + + def check_all_files_written(self): + if len(self.undeclared_files) > 0: + raise Exception( + "trying to write files {} which are not ".format(self.undeclared_files) + + "in the list of outputs this script produces. 
" + + "use will_write to add them.") + if len(self.filenames) > 0: + raise Exception("Outputs declared with 'will_write' were " + + "never written: {}".format(self.filenames)) + + +TEMPLATE_PATH = options.source_path + "/templates" +GENERATOR_DERIVED = CodeTemplate.from_file( + TEMPLATE_PATH + "/GeneratorDerived.h") +STORAGE_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/StorageDerived.cpp") +STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") + +TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") +SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") +TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") +TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") + +TENSOR_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/TensorDerived.cpp") +TENSOR_DENSE_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/TensorDense.cpp") + +REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") +REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") + +TENSOR_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorDerived.h") +TENSOR_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Tensor.h") +TENSOR_METHODS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorMethods.h") + +FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.h") + +NATIVE_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/NativeFunctions.h") + +TYPE_REGISTER = CodeTemplate("""\ +context->type_registry[static_cast(Backend::${backend})] + [static_cast(ScalarType::${scalar_type})] + .reset(new ${type_name}(context)); +detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); +""") + +file_manager = FileManager() +cuda_file_manager = FileManager() + +generators = { + 'CPUGenerator.h': { + 'name': 'CPU', + 'th_generator': 'THGenerator * generator;', + 'header': 'TH/TH.h', + }, + 'CUDAGenerator.h': { + 'name': 'CUDA', + 'th_generator': '', + 'header': 'THC/THC.h' + }, +} + +backends = ['CPU', 'CUDA'] +densities = ['Dense', 'Sparse'] + +# scalar_name, c_type, accreal, th_scalar_type, is_floating_type +scalar_types = [ + ('Byte', 'uint8_t', 'Long', 'uint8_t', False), + ('Char', 'int8_t', 'Long', 'int8_t', False), + ('Double', 'double', 'Double', 'double', True), + ('Float', 'float', 'Double', 'float', True), + ('Int', 'int', 'Long', 'int32_t', False), + ('Long', 'int64_t', 'Long', 'int64_t', False), + ('Short', 'int16_t', 'Long', 'int16_t', False), + ('Half', 'Half', 'Double', 'THHalf', True), +] + +# shared environment for non-derived base classes Type.h Tensor.h Storage.h +top_env = { + 'cpu_type_registrations': [], + 'cpu_type_headers': [], + 'cuda_type_registrations': [], + 'cuda_type_headers': [], + 'type_method_declarations': [], + 'type_method_definitions': [], + 'type_method_inline_definitions': [], + 'tensor_method_declarations': [], + 'tensor_method_definitions': [], + 'function_declarations': [], + 'function_definitions': [], + 'type_ids': [], + 'native_function_declarations': [], +} + + +def dict_representer(dumper, data): + return dumper.represent_dict(data.items()) + + +def postprocess_output_declarations(output_declarations): + # ensure each return has a name associated with it + for decl in output_declarations: + has_named_ret = False + for n, ret in enumerate(decl.returns): + if 'name' not in ret: + assert not has_named_ret + 
if decl.inplace: + ret['name'] = 'self' + elif len(decl.returns) == 1: + ret['name'] = 'result' + else: + ret['name'] = 'result' + str(n) + else: + has_named_ret = True + + def remove_key_if_none(dictionary, key): + if key in dictionary.keys() and dictionary[key] is None: + del dictionary[key] + return dictionary + + return [remove_key_if_none(decl._asdict(), 'buffers') + for decl in output_declarations] + + +def format_yaml(data): + if options.output_dependencies: + # yaml formatting is slow so don't do it if we will ditch it. + return "" + noalias_dumper = yaml.dumper.SafeDumper + noalias_dumper.ignore_aliases = lambda self, data: True + # Support serializing OrderedDict + noalias_dumper.add_representer(OrderedDict, dict_representer) + return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper) + + +def generate_storage_type_and_tensor(backend, density, scalar_type, declarations): + scalar_name, c_type, accreal, th_scalar_type, is_floating_type = scalar_type + env = {} + density_tag = 'Sparse' if density == 'Sparse' else '' + env['Density'] = density + env['ScalarName'] = scalar_name + env['ScalarType'] = c_type + env['THScalarType'] = th_scalar_type + env['AccScalarName'] = accreal + env['isFloatingType'] = is_floating_type + env['isIntegralType'] = not is_floating_type + if density == 'Dense': + env['Storage'] = "{}{}Storage".format(backend, scalar_name) + env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) + env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) + env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) + env['Backend'] = density_tag + backend + env['DenseBackend'] = backend + env['storage_tensor_headers'] = [] + if density != 'Sparse': + env['storage_tensor_headers'] = [ + '#include "ATen/{}.h"'.format(env['Storage']), + '#include "ATen/{}.h"'.format(env['Tensor']), + '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), + '#include "ATen/{}IntTensor.h"'.format(env['Backend']), + '#include "ATen/{}LongTensor.h"'.format(env['Backend']), + ] + + # used for generating switch logic for external functions + tag = density_tag + backend + scalar_name + env['TypeID'] = 'TypeID::' + tag + top_env['type_ids'].append(tag + ',') + + if backend == 'CUDA': + env['th_headers'] = [ + '#include ', + '#include ', + '#include ', + '#undef THNN_', + '#undef THCIndexTensor_', + ] + env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'].append('#include ') + sname = '' if scalar_name == "Float" else scalar_name + env['THType'] = 'Cuda{}'.format(sname) + env['THStorage'] = 'THCuda{}Storage'.format(sname) + env['THTensor'] = 'THCuda{}Tensor'.format(sname) + env['THIndexTensor'] = 'THCudaLongTensor' + env['state'] = ['context->getTHCState()'] + env['isCUDA'] = 'true' + env['storage_device'] = 'return storage->device;' + env['Generator'] = 'CUDAGenerator' + else: + env['th_headers'] = [ + '#include ', + '#include ', + '#include ', + '#undef THNN_', + ] + env['extra_cuda_headers'] = [] + env['THType'] = scalar_name + env['THStorage'] = "TH{}Storage".format(scalar_name) + env['THTensor'] = 'TH{}Tensor'.format(scalar_name) + env['THIndexTensor'] = 'THLongTensor' + env['state'] = [] + env['isCUDA'] = 'false' + env['storage_device'] = 'throw std::runtime_error("CPU storage has no device");' + env['Generator'] = 'CPUGenerator' + env['AS_REAL'] = env['ScalarType'] + if scalar_name == "Half": + env['SparseTensor'] = 'Tensor' + if backend == "CUDA": + env['to_th_type'] = 'HalfFix<__half,Half>' + env['to_at_type'] = 'HalfFix' + 
env['AS_REAL'] = 'convert' + env['THScalarType'] = 'half' + else: + env['to_th_type'] = 'HalfFix' + env['to_at_type'] = 'HalfFix' + elif scalar_name == 'Long': + env['to_th_type'] = 'long' + env['to_at_type'] = 'int64_t' + else: + env['to_th_type'] = '' + env['to_at_type'] = '' + + declarations, definitions = function_wrapper.create_derived( + env, declarations) + env['type_derived_method_declarations'] = declarations + env['type_derived_method_definitions'] = definitions + + fm = file_manager + if env['DenseBackend'] == 'CUDA': + fm = cuda_file_manager + + if density != 'Sparse': + # there are no storage or tensor types for sparse; it's all uniform + fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) + fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) + env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) + fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) + fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) + + if density != 'Sparse': + fm.write(env['Type'] + ".cpp", TYPE_DERIVED_CPP, env) + else: + fm.write(env['Type'] + ".cpp", SPARSE_TYPE_DERIVED_CPP, env) + fm.write(env['Type'] + ".h", TYPE_DERIVED_H, env) + + type_register = TYPE_REGISTER.substitute(backend=env['Backend'], scalar_type=scalar_name, type_name=env['Type']) + if env['DenseBackend'] == 'CPU': + top_env['cpu_type_registrations'].append(type_register) + top_env['cpu_type_headers'].append( + '#include "ATen/{}.h"'.format(env['Type'])) + else: + assert env['DenseBackend'] == 'CUDA' + top_env['cuda_type_registrations'].append(type_register) + top_env['cuda_type_headers'].append( + '#include "ATen/{}.h"'.format(env['Type'])) + + return env + + +def iterate_types(): + for backend in backends: + for density in densities: + for scalar_type in scalar_types: + if density == 'Sparse' and scalar_type[0] == 'Half': + # THS does not do half type yet. 
+ continue + yield (backend, density, scalar_type) + + +################### +# declare what files will be output _before_ we do any work +# so that the script runs quickly when we are just querying the +# outputs +def declare_outputs(): + files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', + 'TensorMethods.h', 'Functions.h', + 'CPUCopy.cpp', 'NativeFunctions.h'] + for f in files: + file_manager.will_write(f) + cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] + for f in cuda_files: + cuda_file_manager.will_write(f) + for fname in sorted(generators.keys()): + fm = file_manager + if generators[fname]['name'] == 'CUDA': + fm = cuda_file_manager + fm.will_write(fname) + for backend, density, scalar_types in iterate_types(): + scalar_name = scalar_types[0] + full_backend = "Sparse" + backend if density == "Sparse" else backend + for kind in ["Storage", "Type", "Tensor"]: + if kind != 'Type' and density == "Sparse": + # No Storage or Tensor for sparse + continue + fm = file_manager + if backend == 'CUDA': + fm = cuda_file_manager + fm.will_write("{}{}{}.h".format(full_backend, scalar_name, kind)) + fm.will_write("{}{}{}.cpp".format(full_backend, scalar_name, kind)) + + +def filter_by_extension(files, *extensions): + filtered_files = [] + for file in files: + for extension in extensions: + if file.endswith(extension): + filtered_files.append(file) + return filtered_files + + +def generate_outputs(): + cwrap_files = filter_by_extension(options.files, '.cwrap') + nn_files = filter_by_extension(options.files, 'nn.yaml', '.h') + native_files = filter_by_extension(options.files, 'native_functions.yaml') + + declarations = [d + for file in cwrap_files + for d in cwrap_parser.parse(file)] + + declarations += nn_parse.run(nn_files) + declarations += native_parse.run(native_files) + declarations = preprocess_declarations.run(declarations) + for fname, env in generators.items(): + fm = file_manager + if env['name'] == 'CUDA': + fm = cuda_file_manager + fm.write(fname, GENERATOR_DERIVED, env) + + # note: this will fill in top_env['type/tensor_method_declarations/definitions'] + # and modify the declarations to include any information that will all_backends + # be used by function_wrapper.create_derived + output_declarations = function_wrapper.create_generic(top_env, declarations) + output_declarations = postprocess_output_declarations(output_declarations) + file_manager.write("Declarations.yaml", format_yaml(output_declarations)) + + # populated by generate_storage_type_and_tensor + all_types = [] + + for backend, density, scalar_type in iterate_types(): + all_types.append(generate_storage_type_and_tensor( + backend, density, scalar_type, declarations)) + + file_manager.write('Type.h', TYPE_H, top_env) + file_manager.write('Type.cpp', TYPE_CPP, top_env) + + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) + cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) + + file_manager.write('Tensor.h', TENSOR_H, top_env) + file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) + file_manager.write('Functions.h', FUNCTIONS_H, top_env) + + file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) + cuda_file_manager.write('CUDACopy.cpp', copy_wrapper.create(all_types, 'CUDA')) + file_manager.write('NativeFunctions.h', NATIVE_FUNCTIONS_H, top_env) + + file_manager.check_all_files_written() + cuda_file_manager.check_all_files_written() + + +declare_outputs() +if options.output_dependencies is not None: + 
file_manager.write_outputs(options.output_dependencies)
+    cuda_file_manager.write_outputs(options.output_dependencies + "-cuda")
+else:
+    generate_outputs()
diff --git a/aten/src/ATen/mkl/Descriptors.h b/aten/src/ATen/mkl/Descriptors.h
new file mode 100644
index 0000000..efedcd0
--- /dev/null
+++ b/aten/src/ATen/mkl/Descriptors.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "Exceptions.h"
+#include 
+#include 
+
+namespace at { namespace native {
+
+struct DftiDescriptorDeleter {
+  void operator()(DFTI_DESCRIPTOR* desc) {
+    if (desc != nullptr) {
+      MKL_DFTI_CHECK(DftiFreeDescriptor(&desc));
+    }
+  }
+};
+
+class DftiDescriptor {
+public:
+  void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) {
+    if (desc_ != nullptr) {
+      throw std::runtime_error("DFTI DESCRIPTOR can only be initialized once");
+    }
+    DFTI_DESCRIPTOR *raw_desc;
+    if (signal_ndim == 1) {
+      MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0]));
+    } else {
+      MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, signal_ndim, sizes));
+    }
+    desc_.reset(raw_desc);
+  }
+
+  DFTI_DESCRIPTOR *get() const {
+    if (desc_ == nullptr) {
+      throw std::runtime_error("DFTI DESCRIPTOR has not been initialized");
+    }
+    return desc_.get();
+  }
+
+private:
+  std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
+};
+
+
+}} // at::native
diff --git a/aten/src/ATen/mkl/Exceptions.h b/aten/src/ATen/mkl/Exceptions.h
new file mode 100644
index 0000000..e954a07
--- /dev/null
+++ b/aten/src/ATen/mkl/Exceptions.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at { namespace native {
+
+static inline void MKL_DFTI_CHECK(MKL_INT status)
+{
+  if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) {
+    std::ostringstream ss;
+    ss << "MKL FFT error: " << DftiErrorMessage(status);
+    throw std::runtime_error(ss.str());
+  }
+}
+
+}} // namespace at::native
diff --git a/aten/src/ATen/mkl/Limits.h b/aten/src/ATen/mkl/Limits.h
new file mode 100644
index 0000000..b0d3829
--- /dev/null
+++ b/aten/src/ATen/mkl/Limits.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include 
+
+namespace at { namespace native {
+
+  // Since the size of MKL_LONG varies across platforms (linux 64 bit, windows
+  // 32 bit), we need to programmatically calculate the max.
+  static int64_t MKL_LONG_MAX = ((1LL << (sizeof(MKL_LONG) * 8 - 2)) - 1) * 2 + 1;
+
+}} // namespace
diff --git a/aten/src/ATen/mkl/README.md b/aten/src/ATen/mkl/README.md
new file mode 100644
index 0000000..2916246
--- /dev/null
+++ b/aten/src/ATen/mkl/README.md
@@ -0,0 +1,4 @@
+All files living in this directory are written with the assumption that MKL is available,
+which means that this code is not guarded by `#if AT_MKL_ENABLED()`. Therefore, whenever
+you need to use definitions from here, please guard the `#include` and any use of those
+definitions with the `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](native/mkl/SpectralOps.cpp).
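A minimal sketch of the guard the README above asks for, assuming `AT_MKL_ENABLED()` comes from `ATen/Config.h` (which Convolution.cpp later in this patch already includes for the analogous `AT_MKLDNN_ENABLED()` check). The operator name `my_mkl_op` is made up for illustration and is not part of this patch:

```
// Illustrative only: guard both the #include and the call sites so builds
// without MKL still compile. my_mkl_op is a hypothetical caller, not real ATen code.
#include "ATen/ATen.h"
#include "ATen/Config.h"  // assumed to provide AT_MKL_ENABLED()

#if AT_MKL_ENABLED()
#include "ATen/mkl/Descriptors.h"  // only safe to include when MKL is present
#endif

#include <stdexcept>

namespace at { namespace native {

Tensor my_mkl_op(const Tensor& self) {
#if AT_MKL_ENABLED()
  // MKL is available: the unguarded helpers from ATen/mkl/ may be used here.
  DftiDescriptor desc;
  (void)desc;  // real code would configure the descriptor and run the DFTI computation
  return self.clone();
#else
  // Built without MKL: fail loudly instead of referencing MKL symbols.
  throw std::runtime_error("my_mkl_op: ATen not compiled with MKL support");
#endif
}

}} // namespace at::native
```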
\ No newline at end of file diff --git a/aten/src/ATen/mkldnn/Runtime.cpp b/aten/src/ATen/mkldnn/Runtime.cpp new file mode 100644 index 0000000..54f999e --- /dev/null +++ b/aten/src/ATen/mkldnn/Runtime.cpp @@ -0,0 +1,5 @@ +#include "Runtime.h" + +namespace at { namespace native { + +}} // namespace at::native diff --git a/aten/src/ATen/mkldnn/Runtime.h b/aten/src/ATen/mkldnn/Runtime.h new file mode 100644 index 0000000..c58ef2c --- /dev/null +++ b/aten/src/ATen/mkldnn/Runtime.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +using namespace mkldnn; + +namespace at { namespace native { + +// CpuEngine singleton +struct CpuEngine { + static CpuEngine& Instance() { + static CpuEngine myInstance; + return myInstance; + } + engine& get_engine() { + return _cpu_engine; + } + CpuEngine(CpuEngine const&) = delete; + CpuEngine& operator=(CpuEngine const&) = delete; + +protected: + CpuEngine():_cpu_engine(mkldnn::engine::cpu, 0) {} + ~CpuEngine() {} + +private: + engine _cpu_engine; +}; + +// Stream singleton +struct Stream { + static Stream& Instance() { + static Stream myInstance; + return myInstance; + }; + stream& get_stream() { + return _cpu_stream; + } + Stream(Stream const&) = delete; + Stream& operator=(Stream const&) = delete; + +protected: + Stream():_cpu_stream(mkldnn::stream::kind::eager) {} + ~Stream() {} + +private: + stream _cpu_stream; +}; + +}} // namespace at::native diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp new file mode 100644 index 0000000..87bd091 --- /dev/null +++ b/aten/src/ATen/native/Activation.cpp @@ -0,0 +1,72 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Dispatch.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Half.h" + +namespace at { namespace native { + +static const double SELU_ALPHA = 1.6732632423543772848170429916717; +static const double SELU_SCALE = 1.0507009873554804934193349852946; + +Tensor relu(const Tensor & self) { + return self.clamp_min(0.0); +} + +Tensor & relu_(Tensor & self) { + return self.clamp_min_(0.0); +} + +Tensor selu(const Tensor & self) { + return at::elu(self, SELU_ALPHA, SELU_SCALE); +} + +Tensor & selu_(Tensor & self) { + return at::elu_(self, SELU_ALPHA, SELU_SCALE); +} + +Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { + return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); +} + +Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { + return at::rrelu_with_noise_(self, self.type().tensor(), lower, upper, training, generator); +} + +Tensor hardshrink_cpu(const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_cpu", [&] { + scalar_t* lambd_tensor_d = lambd_tensor.data(); + at::CPU_tensor_apply2( + self, + out_tensor, + [lambd_tensor_d]( + scalar_t& self_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd_tensor_d && self_val <= *lambd_tensor_d) ? convert(0) : self_val; + }); + }); + return out_tensor; +} + +Tensor hardshrink_backward_cpu(const Tensor & grad, const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? 
Backend::CUDA : Backend::CPU);
+  auto out_tensor = at::empty_like(self);
+  AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_backward_cpu", [&] {
+    scalar_t* lambd_tensor_d = lambd_tensor.data<scalar_t>();
+    at::CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(
+        self,
+        grad,
+        out_tensor,
+        [lambd_tensor_d](
+            scalar_t& self_val,
+            scalar_t& grad_val,
+            scalar_t& out_tensor_val) {
+          out_tensor_val = (self_val >= -*lambd_tensor_d && self_val <= *lambd_tensor_d) ? convert<scalar_t>(0) : grad_val;
+        });
+  });
+  return out_tensor;
+}
+
+}} // namespace at::native
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
new file mode 100644
index 0000000..a537691
--- /dev/null
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -0,0 +1,685 @@
+#include "ATen/ATen.h"
+#include "ATen/NativeFunctions.h"
+
+#include "ATen/Config.h"
+
+namespace at { namespace native {
+
+struct ConvParams {
+  std::vector<int64_t> stride;
+  std::vector<int64_t> padding;
+  std::vector<int64_t> dilation;
+  bool transposed;
+  std::vector<int64_t> output_padding;
+  int groups;
+  bool benchmark;
+  bool deterministic;
+  bool cudnn_enabled;
+
+  bool is_strided() const;
+  bool is_dilated() const;
+  bool is_padded() const;
+  bool is_output_padding_neg() const;
+  bool is_output_padding_big() const;
+  bool is_padding_neg() const;
+  void view1d_as_2d();
+  bool use_cudnn(const at::Tensor& input) const;
+  bool use_mkldnn(const at::Tensor& input) const;
+  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
+};
+
+std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
+  out << "ConvParams {"
+      << " stride = " << IntList{params.stride}
+      << " padding = " << IntList{params.padding}
+      << " dilation = " << IntList{params.dilation}
+      << " transposed = " << params.transposed
+      << " output_padding = " << IntList{params.output_padding}
+      << " groups = " << params.groups
+      << " benchmark = " << params.benchmark
+      << " deterministic = " << params.deterministic
+      << " cudnn_enabled = " << params.cudnn_enabled
+      << "}";
+  return out;
+}
+
+auto ConvParams::is_strided() const -> bool {
+  bool is_strided = false;
+  for (int s : stride) {
+    is_strided |= (s != 1);
+  }
+  return is_strided;
+}
+
+auto ConvParams::is_dilated() const -> bool {
+  bool is_dilated = false;
+  for (int d : dilation) {
+    is_dilated |= (d != 1);
+  }
+  return is_dilated;
+}
+
+auto ConvParams::is_padded() const -> bool {
+  bool is_padded = false;
+  for (int p : padding) {
+    is_padded |= (p != 0);
+  }
+  return is_padded;
+}
+
+auto ConvParams::is_output_padding_neg() const -> bool {
+  bool is_non_neg = false;
+  for (int p : output_padding) {
+    is_non_neg |= (p < 0);
+  }
+  return is_non_neg;
+}
+
+auto ConvParams::is_output_padding_big() const -> bool {
+  bool is_big = false;
+  for (size_t i = 0; i < output_padding.size(); i++) {
+    is_big |= (output_padding[i] >= stride[i] || output_padding[i] >= dilation[i]);
+  }
+  return is_big;
+}
+
+auto ConvParams::is_padding_neg() const -> bool {
+  bool is_non_neg = false;
+  for (int p : padding) {
+    is_non_neg |= (p < 0);
+  }
+  return is_non_neg;
+}
+
+
+auto ConvParams::view1d_as_2d() -> void {
+  if (stride.size() == 1) {
+    stride.insert(stride.begin(), 1);
+    padding.insert(padding.begin(), 0);
+    dilation.insert(dilation.begin(), 1);
+    output_padding.insert(output_padding.begin(), 0);
+  }
+}
+
+auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool {
+  if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+    return false;
+  }
+  if (!input.type().is_cuda() || !cudnn_enabled) {
+    return false;
+  }
+  if (deterministic && is_dilated()) {
+    // 
cudnn doesn't support deterministic dilated convolution fully yet + return false; + } + if (is_dilated()) { + return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big(); + } + return !is_output_padding_big(); +} + +auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool { +#if AT_MKLDNN_ENABLED() + return input.type().backend() == kCPU && + input.type().scalarType() == kFloat && // only on CPU Float Tensors + !is_dilated() && // doesn't support dilation + !transposed && // or transposed tensors + input.ndimension() == 4; // must be in NCHW format +#endif + return false; +} + +// We currently only have depthwise support for the case where groups == +// nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of +// a depthwise multiplier) +auto ConvParams::is_depthwise( + const at::Tensor& input, const at::Tensor& weight) const -> bool { + return input.type().is_cuda() && + !transposed && + input.ndimension() == 4 && + input.size(1) == groups && + groups > 1 && // no point if there is only a single group + weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels +} + +static void check_input_shape_forward(const at::Tensor& input, + const at::Tensor& weight, const at::Tensor& bias, + int64_t groups, bool transposed) { + int64_t k = input.ndimension(); + int64_t weight_dim = weight.ndimension(); + + if (weight_dim != k) { + std::stringstream ss; + ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim + << "-dimensional weight " << weight.sizes() << ", but got input of size " + << input.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + if (weight.size(0) < groups) { + std::stringstream ss; + ss << "Given groups=" << groups << ", expected weight to be at least " + << groups << " at dimension 0, but got weight of size " << weight.sizes() + << " instead"; + throw std::runtime_error(ss.str()); + } + + if (!transposed) { + if (input.size(1) != (weight.size(1) * groups)) { + std::stringstream ss; + ss << "Given groups=" << groups << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " + << (weight.size(1) * groups) << " channels, but got " << input.size(1) + << " channels instead"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { + std::stringstream ss; + ss << "Given weight of size " << weight.sizes() + << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" + << ", but got bias of size " << bias.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + } else { // transposed + if (input.size(1) != weight.size(0)) { + std::stringstream ss; + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " + << weight.size(0) << " channels, but got " << input.size(1) + << " channels instead"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { + std::stringstream ss; + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" + << ", but got bias of size " << bias.sizes() << " instead"; + throw std::runtime_error(ss.str()); + } + } +} + +static auto view4d(const at::Tensor& tensor) -> at::Tensor { + if (tensor.ndimension() != 3) throw 
std::runtime_error("expected 3D tensor"); + return tensor.unsqueeze(2); +} + +static auto view3d(const at::Tensor& tensor) -> at::Tensor { + if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + return tensor.squeeze(2); +} + + +static at::Tensor subtensor(at::Tensor& tensor, int dim, int groups, int g) { + if (!tensor.defined()) { + return at::Tensor(); + } + int64_t n = tensor.sizes()[dim] / groups; + return tensor.narrow(dim, n * g, n).contiguous(); +} + + +at::Tensor conv1d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {0}, groups); +} + +at::Tensor conv2d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {{0, 0}}, groups); +} + +at::Tensor conv3d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, int64_t groups) { + return at::convolution(input, weight, bias, stride, padding, dilation, + false, {{0, 0, 0}}, groups); +} + +at::Tensor conv_transpose1d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor conv_transpose2d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor conv_transpose3d( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList output_padding, int64_t groups, IntList dilation) { + return at::convolution(input, weight, bias, stride, padding, dilation, + true, output_padding, groups); +} + +at::Tensor convolution( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, + bool transposed, IntList output_padding, int64_t groups) { + auto& ctx = at::globalContext(); + return at::_convolution(input, weight, bias, stride, padding, dilation, + transposed, output_padding, groups, + ctx.benchmarkCuDNN(), ctx.deterministicCuDNN(), ctx.userEnabledCuDNN()); +} + +static inline std::vector convolution_expand_param_if_needed( + IntList list_param, const char *param_name, int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t) list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << "=" << list_param; + throw std::runtime_error(ss.str()); + } else { + return list_param.vec(); + } +} + +at::Tensor _convolution( + const Tensor& input_r, const Tensor& weight_r, const Tensor& bias_r, + IntList stride_, IntList padding_, IntList dilation_, + bool transposed_, IntList output_padding_, int64_t groups_, + bool benchmark, bool deterministic, bool cudnn_enabled) { + + auto input = input_r.contiguous(); + auto weight = weight_r; + auto 
bias = bias_r; + auto k = weight.ndimension(); + int64_t dim = k - 2; + + if (dim <= 0) { + throw std::runtime_error("weight should have at least two dimensions"); + } + + ConvParams params; + params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); + params.padding = convolution_expand_param_if_needed(padding_, "padding", dim); + params.dilation = convolution_expand_param_if_needed(dilation_, "dilation", dim); + params.transposed = transposed_; + params.output_padding = convolution_expand_param_if_needed(output_padding_, "output_padding", dim); + params.groups = groups_; + params.benchmark = benchmark; + params.deterministic = deterministic; + params.cudnn_enabled = cudnn_enabled; + + if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); + if (params.is_output_padding_neg()) throw std::runtime_error("negative output_padding is not supported"); + + check_input_shape_forward(input, weight, bias, params.groups, params.transposed); + + if (k == 3) { + params.view1d_as_2d(); + input = view4d(input); + weight = view4d(weight); + } + + auto output = input.type().tensor(); + + if (params.is_depthwise(input, weight)) { + /* output.resize_(output_size(input, weight)); */ + + auto kernel_size = weight.sizes().slice(2); + auto stride = params.stride; + auto padding = params.padding; + auto dilation = params.dilation; + + output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); + } else if (params.use_cudnn(input)) { + if (input.type() != weight.type()){ + std::stringstream ss; + ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && input.type() != bias.type()){ + std::stringstream ss; + ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + + if (params.transposed) { + output = at::cudnn_convolution_transpose( + input, weight, bias, + params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); + } else { + output = at::cudnn_convolution( + input, weight, bias, + params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); + } + } else if (params.use_mkldnn(input)) { +#if AT_MKLDNN_ENABLED() + if (input.type() != weight.type()){ + std::stringstream ss; + ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + if (bias.defined() && input.type() != bias.type()){ + std::stringstream ss; + ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; + throw std::runtime_error(ss.str()); + } + + output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); +#endif + } else { + if (params.groups == 1) { + output = at::_convolution_nogroup( + input, weight, bias, params.stride, params.padding, params.dilation, params.transposed, params.output_padding); + } else { + std::vector outputs(params.groups); + for (int g = 0; g < params.groups; ++g) { + auto input_g = subtensor(input, 1, params.groups, g); + auto weight_g = subtensor(weight, 0, params.groups, g); + auto bias_g = subtensor(bias, 0, params.groups, g); + outputs[g] = at::_convolution_nogroup( + 
input_g, weight_g, bias_g, params.stride, params.padding, params.dilation, params.transposed, params.output_padding); + } + output = at::cat(outputs, 1); + } + } + + if (k == 3) { + output = view3d(output); + } + + return output; +} + +// A generic function for convolution implementations which don't +// natively implement groups (e.g., not CuDNN). +at::Tensor _convolution_nogroup( + const Tensor& input, const Tensor& weight, const Tensor& bias, + IntList stride, IntList padding, IntList dilation, + bool transposed, IntList output_padding) { + + ConvParams params; + params.stride = stride; + params.padding = padding; + params.dilation = dilation; + params.transposed = transposed; + params.output_padding = output_padding; + params.groups = 1; + params.benchmark = false; + params.deterministic = false; + params.cudnn_enabled = false; + + auto dim = input.ndimension(); + auto dilated = params.is_dilated(); + auto kernel_size = weight.sizes().slice(2); + + if (params.transposed) { + if (dim == 4) { + return at::thnn_conv_transpose2d( + input, weight, kernel_size, bias, + stride, padding, output_padding, dilation); + } else if (dim == 5) { + return at::thnn_conv_transpose3d( + input, weight, kernel_size, bias, + stride, padding, output_padding, dilation); + } + } else { /* Not transposed */ + if (dim == 4) { + if (dilated) { + return at::thnn_conv_dilated2d( + input, weight, kernel_size, bias, + stride, padding, dilation); + } else { /* dim == 4, non-dilated */ + /* CPU implementation has specialized MM kernels + for non-dilated case here */ + return at::thnn_conv2d( + input, weight, kernel_size, bias, + stride, padding); + } + } else if (dim == 5 && (input.type().is_cuda() || dilated)) { + return at::thnn_conv_dilated3d( + input, weight, kernel_size, bias, + stride, padding, dilation); + } else if (dim == 5) { /* dim == 5, CPU, non-dilated */ + /* CPU implementation has specialized MM kernels + for non-dilated case here */ + return at::thnn_conv3d( + input, weight, kernel_size, bias, + stride, padding); + } + } + + throw std::runtime_error("unsupported ConvNd parameters"); +} + +static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { + int64_t n = var.sizes()[dim] / groups; + auto result = var.narrow(dim, n * g, n); + return result; +} + +std::tuple _convolution_double_backward( + const Tensor& ggI, const Tensor& ggW_r, const Tensor& ggb, + const Tensor& gO_r, const Tensor& weight_r, const Tensor& input, + IntList stride_, IntList padding_, IntList dilation_, + bool transposed_, IntList output_padding_, int64_t groups_, + bool benchmark, bool deterministic, bool cudnn_enabled, + std::array output_mask) { + + auto ggW = ggW_r; + auto gO = gO_r; + auto weight = weight_r; + + ConvParams params; + params.stride = stride_; + params.padding = padding_; + params.dilation = dilation_; + params.transposed = transposed_; + params.output_padding = output_padding_; + params.groups = groups_; + params.benchmark = benchmark; + params.deterministic = deterministic; + params.cudnn_enabled = cudnn_enabled; + + // Compute ggO = conv(ggI, w) + conv(i, ggW) + ggb + Tensor ggO; + if (ggI.defined()) { + if (weight.type().is_cuda()) { + weight = weight.contiguous(); + } + ggO = at::_convolution(ggI, weight, Tensor(), params.stride, params.padding, params.dilation, params.transposed, params.output_padding, params.groups, params.benchmark, params.deterministic, params.cudnn_enabled); + } + + if (ggW.defined()) { + if (ggW.type().is_cuda()) { + ggW = ggW.contiguous(); + } + auto ggW_term = 
at::_convolution(input, ggW, Tensor(), params.stride, params.padding, params.dilation, params.transposed, params.output_padding, params.groups, params.benchmark, params.deterministic, params.cudnn_enabled); + if (ggO.defined()) { + ggO = ggO + ggW_term; + } else { + ggO = ggW_term; + } + } + + if (ggb.defined()) { + // View as (1, ggb.size(0), 1, 1...) + + // Expand + std::vector new_size(gO.ndimension(), 1); + new_size[1] = ggb.sizes()[0]; + auto ggb_contiguous = ggb.contiguous(); + auto ggb_view = ggb_contiguous.view(new_size); + + // Expand + auto ggb_expanded = ggb_view.expand(gO.sizes()); + + if (ggO.defined()) { + ggO = ggO + ggb_expanded; + } else { + ggO = ggb_expanded; + } + } + + // Compute gW = conv(ggI, gO) + Tensor gW; + if (ggI.defined()) { + // Modified params with correct padding + ConvParams gw_conv_params(params); + + // Disable groups as they are handled separately + auto groups = gw_conv_params.groups; + gw_conv_params.groups = 1; + std::swap(gw_conv_params.dilation, gw_conv_params.stride); + + // Transpose gO and ggI to accumulate over batch + auto gOt = gO.transpose(0, 1); + auto ggIt = ggI.transpose(0, 1); + + Tensor gWt; + // Compute conv + if (groups == 1) { + if (gOt.type().is_cuda()) { + gOt = gOt.contiguous(); + } + + // Compute conv + if (params.transposed) { + gw_conv_params.transposed = false; + gWt = at::_convolution(gOt, ggIt, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } else { + gWt = at::_convolution(ggIt, gOt, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } + } else { + std::vector gWt_list(groups); + for (int g = 0; g < groups; ++g) { + auto ggIt_g = subvariable(ggIt, 0, groups, g); + auto gOt_g = subvariable(gOt, 0, groups, g); + if (gOt_g.type().is_cuda()) { + gOt_g = gOt_g.contiguous(); + } + + // Compute conv + if (params.transposed) { + gw_conv_params.transposed = false; + gWt_list[g] = at::_convolution(gOt_g, ggIt_g, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } else { + gWt_list[g] = at::_convolution(ggIt_g, gOt_g, Tensor(), gw_conv_params.stride, gw_conv_params.padding, gw_conv_params.dilation, gw_conv_params.transposed, gw_conv_params.output_padding, gw_conv_params.groups, gw_conv_params.benchmark, gw_conv_params.deterministic, gw_conv_params.cudnn_enabled); + } + } + + gWt = at::cat(gWt_list, 1); + } + + // Transpose gW to match chan_in and chan_out + gW = gWt.transpose(0, 1); + + // narrow gW to only relevant portion + // we do it this way instead of narrowing the input itself because + // the ConvForward kernels don't support asymmetric padding. 
+ auto gW_size = gW.sizes(); + auto w_size = weight.sizes(); + for (size_t i = 2; i < gW_size.size(); ++i) { + if (gW_size[i] > w_size[i]) { + gW = gW.narrow(i, 0, w_size[i]); + gW_size = gW.sizes(); + } + } + } + + // Compute gI = convT(ggW, gO.t()) if !transposed + // gI = conv(go, ggw) if transposed + Tensor gI; + if (ggW.defined()) { + ConvParams gi_conv_params(params); + gi_conv_params.transposed = !params.transposed; + + if (params.transposed) { + if (gO.type().is_cuda()) { + gO = gO.contiguous(); + } + gI = at::_convolution(gO, ggW, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + + // narrow gI to only relevant portion + // we do it this way because negative output_padding is not supported + // TODO: figure out if we can narrow gO and save some compute, + // rather than narrowing the computed gI + auto gI_size = gI.sizes(); + auto i_size = input.sizes(); + for (size_t i = 2; i < gI_size.size(); ++i) { + if (gI_size[i] > i_size[i]) { + gI = gI.narrow(i, 0, i_size[i]); + gI_size = gI.sizes(); + } + } + } else { + auto groups = gi_conv_params.groups; + gi_conv_params.groups = 1; + // swap stride and dilation + std::swap(gi_conv_params.dilation, gi_conv_params.stride); + + auto ggWt = ggW.transpose(0, 1); + auto gOt = gO.transpose(0, 1); + + // calculate output_padding + // TODO: figure out why this needs to be computed... + auto kernel_size = weight.sizes().slice(2); + auto input_shape = input.sizes().slice(2); + auto grad_output_shape = gO.sizes().slice(2); + + if (kernel_size.size() == 1) { + auto expected_input_shape = (kernel_size[0] - 1) * gi_conv_params.stride[1] + - 2 * gi_conv_params.padding[1] + + (gi_conv_params.dilation[1] * (grad_output_shape[0] - 1) + 1); + if (expected_input_shape != input_shape[0]) { + gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape; + } + } else { + for(size_t i = 0; i < kernel_size.size(); ++i) { + // Check if whole input has been used or not + auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.stride[i] + - 2 * gi_conv_params.padding[i] + + (gi_conv_params.dilation[i] * (grad_output_shape[i] - 1) + 1); + if (expected_input_shape != input_shape[i]) { + gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape; + } + } + } + + Tensor gIt; + if (params.groups == 1) { + if (gOt.type().is_cuda()) { + gOt = gOt.contiguous(); + } + + gIt = at::_convolution(ggWt, gOt, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + } else { + std::vector gIt_list(params.groups); + for (int g = 0; g < groups; ++g) { + auto ggWt_g = subvariable(ggWt, 1, groups, g); + auto gOt_g = subvariable(gOt, 0, groups, g); + if (gOt_g.type().is_cuda()) { + gOt_g = gOt_g.contiguous(); + } + + gIt_list[g] = at::_convolution(ggWt_g, gOt_g, Tensor(), gi_conv_params.stride, gi_conv_params.padding, gi_conv_params.dilation, gi_conv_params.transposed, gi_conv_params.output_padding, gi_conv_params.groups, gi_conv_params.benchmark, gi_conv_params.deterministic, gi_conv_params.cudnn_enabled); + } + + gIt = at::cat(gIt_list, 0); + } + + gI = gIt.transpose(0, 1); + } + } + + if (output_mask[0] && !ggO.defined()) ggO = at::zeros_like(gO); + if 
(output_mask[1] && !gI.defined()) gI = at::zeros_like(input); + if (output_mask[2] && !gW.defined()) gW = at::zeros_like(weight); + + return std::tuple{ggO, gI, gW}; +} + +}} // at::native diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp new file mode 100644 index 0000000..0c2ac96 --- /dev/null +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -0,0 +1,107 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { +namespace native { + +Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, int64_t pad) { + AT_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " + "in_channel"); + AT_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," + " in_channels, out_channels."); + AT_CHECK(bias.dim() == 1, "Bias must be 1-D"); + + auto input_size = self.sizes(); + auto weight_size = weight.sizes(); + + auto ilen = input_size[0]; + auto batchSize = input_size[1]; + auto inputPlanes = input_size[2]; + auto outputPlanes = weight_size[2]; + auto kw = weight_size[0]; + auto olen = input_size[0] - kw + 1 + pad * 2; + auto real_pad = (olen - ilen + kw - 1) / 2; + + // Make sure shapes are correct. + // Input = (time, batch, in_channels) + // Weight = (kernel_width, in_channels, out_channels) + // Bias = (out_channels) + AT_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " + "is not == dim 1 in the weight tensor"); + AT_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " + "the weight tensor (output channels)."); + + // input * weights + bias -> output_features + Tensor output = self.type().tensor({ + olen, + input_size[1], + weight_size[2], + }); + output.copy_(bias.expand(output.sizes())); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, static_cast(k - real_pad)); + int oShift = std::max(0, static_cast(real_pad - k)); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // Note: gemm assumes column-major matrices + // input is l*m (row-major) + // weight is m*r (row-major) + // output is l*r (row-major) + if (t > 0) { + auto W = weight[k]; + auto I = self.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); + auto O = output.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + O.addmm_(I, W); + } + } + return output; +} + +std::tuple conv_tbc_backward(const Tensor& dOutput, const Tensor& input, const Tensor& weight, const Tensor& bias, int64_t pad) { + auto input_size = input.sizes(); + auto weight_size = weight.sizes(); + + auto ilen = input_size[0]; + auto batchSize = input_size[1]; + auto inputPlanes = input_size[2]; + auto outputPlanes = weight_size[2]; + auto kw = weight.sizes()[0]; + auto olen = input_size[0] - kw + 1 + pad * 2; + int real_pad = (olen - ilen + kw - 1) / 2; + + Tensor dInput = at::zeros_like(input); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, k - real_pad); + int oShift = std::max(0, real_pad - k); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // dOutput * T(weight) -> dInput + if (t > 0) { + auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + auto dI = dInput.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); + dI.addmm_(dO, weight[k].t()); + } + } + + Tensor dWeight = at::zeros_like(weight); + for (int k = 0; k < kw; k++) { + int iShift = std::max(0, k - real_pad); + int oShift = std::max(0, real_pad - k); + int t = std::min(ilen + real_pad - k, olen) - oShift; + // T(input) * dOutput -> dWeight + if (t > 0) { + auto dW = 
dWeight[k]; + auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); + auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}).t(); + dW.addmm_(I, dO); + } + } + + Tensor dBias = at::zeros_like(bias); + auto tmp = dOutput.sum(0, false); + dBias.copy_(tmp.sum(0)); + + return std::make_tuple(dInput, dWeight, dBias); +} + +} +} diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp new file mode 100644 index 0000000..da49e28 --- /dev/null +++ b/aten/src/ATen/native/Distance.cpp @@ -0,0 +1,10 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + + +namespace at { namespace native { + +Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double eps, bool keepdim) { + return at::norm(x1 - x2 + eps, p, 1, keepdim); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp new file mode 100644 index 0000000..a9bd61a --- /dev/null +++ b/aten/src/ATen/native/Distributions.cpp @@ -0,0 +1,208 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Generator.h" +#include "ATen/native/Distributions.h" + +#include + +#include "TH/THRandom.h" +#include "TH/THGenerator.hpp" +#include "TH/THMath.h" + +namespace { +/* + * This section is a counterpart to Distributions.cu + * + */ + +// The function `sample_poisson` +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +THGenerator* get_generator(at::Generator* gen) { + auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto gen_ = at::check_generator(gen, default_gen); + return gen_->generator; +} + +int64_t sample_poisson(double lambda, THGenerator* generator) { + if (lambda >= 10) { + // transformed rejection method, (Hoermann, 1993) + int64_t k; + double U, V, a, b, invalpha, vr, us; + + double slam = std::sqrt(lambda); + double loglam = std::log(lambda); + b = 0.931 + 2.53 * slam; + a = -0.059 + 0.02483 * b; + invalpha = 1.1239 + 1.1328 / (b - 3.4); + vr = 0.9277 - 3.6224 / (b - 2); + + while (1) { + U = THRandom_standard_uniform(generator) - 0.5; + V = THRandom_standard_uniform(generator); + us = 0.5 - std::fabs(U); + k = (int64_t)std::floor((2 * a / us + b) * U + lambda + 0.43); + if ((us >= 0.07) && (V <= vr)) { + return k; + } + if ((k < 0) || ((us < 0.013) && (V > us))) { + continue; + } + if ((std::log(V) + std::log(invalpha) - std::log(a / (us * us) + b)) <= + (-lambda + k * loglam - std::lgamma((double)k + 1))) { + return k; + } + } + } else if (lambda == 0) { + return 0; + } else { + int64_t X; + double prod, U, enlam; + + enlam = std::exp(-lambda); + X = 0; + prod = 1.0; + while (1) { + U = THRandom_standard_uniform(generator); + prod *= U; + if (prod > enlam) { + X += 1; + } else { + return X; + } + } + } +} + +} // namespace + +namespace at { +namespace native { + +Tensor bernoulli(const Tensor& self, const Tensor& p, Generator* gen) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli_(result, p, gen); +} + +Tensor bernoulli(const Tensor& self, double p, Generator* gen) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli_(result, p, gen); +} + +Tensor bernoulli(const Tensor& self) { + Tensor result = self.type().tensor(); + result.resize_(self.sizes()); + return native::bernoulli(result, self, nullptr); +} + +Tensor& bernoulli_(Tensor& self, const Tensor& p_, Generator* gen) { + if (!self.is_cuda() && !p_.is_cuda()) { + Tensor p = p_.toType(kDouble); + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2( + self, p, [generator](scalar_t& ret_val, double& p_val) { + ret_val = (scalar_t)THRandom_bernoulli(generator, p_val); + }); + }); + return self; + } + self.copy_(at::_th_bernoulli(std::get<0>(expand_inplace(self, p_)), gen)); + return self; +} + +Tensor& bernoulli_(Tensor& self, double p, Generator* gen) { + self._bernoulli_(p, gen); + return self; +} + +Tensor& bernoulli_(Tensor& self) { + return native::bernoulli_(self, 0.5, nullptr); +} + +Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_FLOATING_TYPES(self.type(), "_standard_gamma_grad", [&] { + CPU_tensor_apply3(ret, self, output, + [](scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { + ret_val = standard_gamma_grad_one(self_val, output_val); + } + ); + }); + return ret; +} + +/* + * This section is a counterpart to Distributions.cu + */ + +Tensor _s_poisson_cpu(const Tensor& lambda, Generator *gen) { + Tensor ret = at::zeros(lambda.sizes(), lambda.type()); + AT_DISPATCH_FLOATING_TYPES(ret.type(), "poisson", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2(ret, lambda, + [generator](scalar_t& ret_val, const 
scalar_t& lambda){ + ret_val = static_cast(sample_poisson(static_cast(lambda), generator)); + } + ); + }); + return ret; +} + +Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) { + Tensor ret = at::zeros(alpha.sizes(), alpha.type()); + AT_DISPATCH_FLOATING_TYPES(ret.type(), "gamma", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply2(ret, alpha, + [generator](scalar_t& ret_val, const scalar_t& alpha){ + BaseSampler standard_uniform([generator] () { + return THRandom_standard_uniform(generator); + }); + BaseSampler standard_normal([generator] () { + return THRandom_normal(generator, 0.0, 1.0); + }); + auto sample = sample_gamma(alpha, standard_uniform, standard_normal); + ret_val = std::max(std::numeric_limits::min(), (scalar_t) sample); + } + ); + }); + + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h new file mode 100644 index 0000000..7a6e078 --- /dev/null +++ b/aten/src/ATen/native/Distributions.h @@ -0,0 +1,215 @@ +#include "TH/THMath.h" +#ifdef __CUDA_ARCH__ +#include +#endif + +namespace { + +#ifdef __CUDA_ARCH__ +#define nvfunction_or_function nvstd::function +#define deviceforcuda __device__ +#else +#define nvfunction_or_function std::function +#define deviceforcuda +// we cannot use std::isnan directly due to some incompatibility of +// gcc constexpr'ing and nvcc +#define isnan std::isnan +#endif + +template +struct BaseSampler { + nvfunction_or_function sampler; + deviceforcuda BaseSampler(nvfunction_or_function sampler): sampler(sampler) {} + deviceforcuda scalar_t sample() { + return sampler(); + } +}; + +// The function `sample_gamma` is +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +template +deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& standard_uniform, BaseSampler& standard_normal) { + accscalar_t scale = 1.0f; + + // Boost alpha for higher acceptance probability. 
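+  // The boost relies on the identity Gamma(alpha) == Gamma(alpha + 1) * U^(1/alpha)
+  // in distribution, for U ~ Uniform(0, 1); `1 - standard_uniform.sample()` is used
+  // as the uniform draw, presumably so the base of the power is never exactly zero.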
+ if (alpha < 1.0f) { + scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); + alpha += 1.0f; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const accscalar_t d = alpha - 1.0f / 3.0f; + const accscalar_t c = 1.0f / std::sqrt(9.0f * d); + for (;;) { + accscalar_t x, y; + do { + x = standard_normal.sample(); + y = 1.0f + c * x; + } while (y <= 0); + const accscalar_t v = y * y * y; + const accscalar_t u = 1 - standard_uniform.sample(); + const accscalar_t xx = x * x; + if (u < 1.0f - 0.0331f * xx * xx) + return static_cast(scale * d * v); + if (std::log(u) < 0.5f * xx + d * (1.0f - v + std::log(v))) + return static_cast(scale * d * v); + } +} + +template +deviceforcuda static inline scalar_t polevl(const scalar_t x, const scalar_t A[], size_t len) { + scalar_t result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +template +deviceforcuda static inline scalar_t digamma_one(scalar_t x) { + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + accscalar_t additional_summand = 0; + int x_is_integer = x == std::floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // it is more standard to write this as recursion, but + // nvcc does not like that + additional_summand = - static_cast(M_PI) / std::tan(static_cast(M_PI) * x); + x = 1 - x; + } + + // Push x to be >= 10 + accscalar_t result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10 + additional_summand; + } + + // Compute asymptotic digamma + static const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t y = 0; + if (x < 1.0e17f) { + accscalar_t z = 1.0 / (x * x); + y = z * polevl(z, A, 6); + } + return static_cast(result + std::log(x) - (0.5f / x) - y + additional_summand); +} + +// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha) +// for random number x drawn from a standard Gamma distribution Gamma(alpha). +template +deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { + // Use a Taylor series expansion for small x. + accscalar_t x = static_cast(x_); + accscalar_t alpha = static_cast(alpha_); + if (x < 0.8f) { + accscalar_t numer = 1; + accscalar_t denom = alpha; + auto series1 = numer / denom; + auto series2 = numer / (denom * denom); + for (int i = 1; i <= 5; ++i) { + numer *= -x / static_cast(i); + denom += 1; + series1 += numer / denom; + series2 += numer / (denom * denom); + } + const auto pow_x_alpha = std::pow(x, alpha); + const auto gamma_pdf = std::pow(x, alpha - 1) * std::exp(-x); + const auto gamma_cdf = pow_x_alpha * series1; + const auto gamma_cdf_alpha = (std::log(x) - digamma_one(alpha)) * gamma_cdf + - pow_x_alpha * series2; + const auto result = -gamma_cdf_alpha / gamma_pdf; + return isnan(result) ? static_cast( 0.f ) : static_cast(result); + } + + // Use a Rice saddle point expansion for large alpha. 
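+  // In the small-x branch above, series1 approximates sum_{k>=0} (-x)^k / (k! * (alpha + k)),
+  // so gamma_cdf ~= x^alpha * series1 is the lower incomplete gamma function gamma(alpha, x),
+  // series2 is its term-by-term derivative in alpha (up to sign), and the returned value
+  // is -(d/dalpha CDF(x; alpha)) / pdf(x; alpha), with the common 1/Gamma(alpha) factor
+  // cancelling between numerator and denominator.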
+ if (alpha > 8.0f) { + if (0.9f * alpha <= x && x <= 1.1f * alpha) { + const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha); + const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x) + - 65 * x * x / alpha + alpha * (107 + 3600 * x); + const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha); + return static_cast(numer_1 * numer_2 / denom); + } + const auto denom = std::sqrt(8 * alpha); + const auto term2 = denom / (alpha - x); + const auto term3 = std::pow(x - alpha - alpha * std::log(x / alpha), static_cast(-1.5)); + const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3; + const auto term1 = std::log(x / alpha) * term23 + - std::sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); + const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha)); + const auto numer = x * term1; + return static_cast(-stirling * numer / denom); + } + + // Use a bivariate rational approximation to the reparameterized gradient. + const auto u = std::log(x / alpha); + const auto v = std::log(alpha); + static const accscalar_t coef_uv[3][8] = { + {0.16009398, -0.094634809, 0.025146376, -0.0030648343, + 1, 0.32668115, 0.10406089, 0.0014179084}, + {0.53487893, 0.1298071, 0.065735949, -0.0015649758, + 0.16639465, 0.020070113, -0.0035938915, -0.00058392623}, + {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777, + 0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07}, + }; + accscalar_t coef_v[8]; + for (int i = 0; i < 8; ++ i) { + coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]); + } + const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3])); + const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7])); + return static_cast(std::exp(p / q)); +} + +} // namespace diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp new file mode 100644 index 0000000..7599386 --- /dev/null +++ b/aten/src/ATen/native/Embedding.cpp @@ -0,0 +1,182 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + + +namespace at { namespace native { + +Tensor embedding(const Tensor & weight, const Tensor & indices, + int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding", indices_arg, kLong); + + // TODO: use tensor.index() after improving perf + if (indices.dim() == 1) { + return weight.index_select(0, indices); + } + + auto size = std::vector(indices.sizes()); + for (auto d : weight.sizes().slice(1)) { + size.push_back(d); + } + return weight.index_select(0, indices.reshape(-1)).view(size); +} + +Tensor embedding_backward( + const Tensor & grad, const Tensor & indices, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq, bool sparse) { + if (sparse) { + return at::embedding_sparse_backward( + grad, indices, num_weights, padding_idx, scale_grad_by_freq); + } else { + return at::embedding_dense_backward( + grad, indices, num_weights, padding_idx, scale_grad_by_freq); + } +} + +Tensor embedding_sparse_backward( + const Tensor & grad_, const Tensor & indices_, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq) { + + auto indices_arg = TensorArg(indices_, "indices", 2); + checkScalarType("embedding_backward", indices_arg, kLong); + + // TODO: implement scale_grad_by_freq + if (scale_grad_by_freq) { + AT_ERROR( + "embedding_backward: scale_grad_by_freq not supported with 
sparse gradients"); + } + + Tensor indices = indices_; + Tensor grad = grad_; + if (padding_idx != -1) { + auto c = indices != padding_idx; + indices = indices.index(c); + grad = grad.index(c); + } + + int64_t num_features = grad_.size(-1); + auto weight_size = std::array{{ num_weights, num_features }}; + auto& dense_type = grad.type(); + auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU); + + // check if all our grad come from padding_idx + if (grad.numel() == 0) { + // FIXME: USE_TH_SIZE_ZERO_DIM + return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor(), + dense_type.tensor(), weight_size); + } + + auto index = indices.reshape({1, -1}); + auto values = grad.reshape({-1, num_features}); + return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); +} + +Tensor embedding_dense_backward_cpu( + const Tensor & grad_, const Tensor & indices, int64_t num_weights, + int64_t padding_idx, bool scale_grad_by_freq) { + + auto indices_arg = TensorArg(indices, "indices", 2); + checkScalarType("embedding_backward", indices_arg, kLong); + + auto indices_contig = indices.contiguous(); + auto indices_data = indices_contig.data(); + int64_t numel = indices.numel(); + + std::unique_ptr counts; + if (scale_grad_by_freq) { + counts.reset(new int64_t[num_weights]); + for (int i = 0; i < numel; i++) { + counts[indices_data[i]] = 0; + } + for (int i = 0; i < numel; i++) { + counts[indices_data[i]]++; + } + } + + auto grad = grad_.contiguous().view({numel, grad_.size(-1)}); + auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); + +#ifdef _OPENMP + if (numel > 1000) { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. 
+ #pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int64_t start = tid * (num_weights/nthreads + 1); + int64_t end = start + (num_weights/nthreads + 1); + for (int64_t i = 0; i < numel; i++) { + if (indices_data[i] != padding_idx) { + int64_t k = indices_data[i]; + if (k >= start && k < end) { + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[k]; + } + grad_weight[k].add_(grad[i], scale); + } + } + } + } + return grad_weight; + } +#endif + + for (int64_t i = 0; i < numel; i++) { + if (indices_data[i] != padding_idx) { + int64_t k = indices_data[i]; + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[k]; + } + grad_weight[k].add_(grad[i], scale); + } + } + + return grad_weight; +} + +Tensor & embedding_renorm_cpu_( + Tensor & self, const Tensor & indices, double max_norm, double norm_type) { + auto self_arg = TensorArg(self, "self", 1); + auto indices_arg = TensorArg(indices, "indices", 2); + checkDim("embedding_renorm_", self_arg, 2); + checkScalarType("embedding_renorm_", indices_arg, kLong); + + auto indices_contig = indices.contiguous(); + + auto num_indices = indices.numel(); + auto data_ptr = indices_contig.data(); + auto sorted_indices = std::vector(data_ptr, data_ptr + num_indices); + std::sort(sorted_indices.begin(), sorted_indices.end(), std::less()); + + #pragma omp parallel for if(num_indices > 1000) + for (int64_t i = 0; i < num_indices; i++) { + if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) { + continue; + } + auto row = self[sorted_indices[i]]; + auto norm = row.norm(norm_type).toCDouble(); + if (norm > max_norm) { + auto scale = max_norm / (norm + 1e-7); + row *= scale; + } + } + + return self; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp new file mode 100644 index 0000000..d171893 --- /dev/null +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -0,0 +1,356 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include "TH/THBlasUtils.h" + +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace { + const int MODE_SUM = 0; + const int MODE_MEAN = 1; + const int MODE_MAX = 2; +} + +namespace at { +namespace native { + +static void make_offset2bag(const Tensor &offsets, const Tensor &indices, + Tensor &offset2bag) { + offset2bag.index_add_( + 0, offsets, at::ones_like(offsets)); // offset2bag = [1 0 1 0 1] + offset2bag[0] -= 1; // offset2bag = [0 0 1 0 1] + offset2bag = offset2bag.cumsum(0); // offset2bag = [0 0 1 1 2] +} + +// This function combines index_select (using select_indices as the index) and +// index_add (using add_indices as the index), without creating an intermediary +// tensor to hold the selected embeddings +template +static void index_select_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &src, + Tensor &output) { + auto add_indices_data = add_indices.data(); + auto select_indices_data = select_indices.data(); + auto src_data = src.data(); + auto output_data = output.data(); + auto numel = add_indices.numel(); + int64_t ddim = src.size(1); + auto src_stride0 = src.stride(0); + auto src_stride1 = src.stride(1); + auto output_stride0 = output.stride(0); + auto output_stride1 = output.stride(1); + for (int64_t i = 0; i < numel; i++) { + THBlas_axpy(ddim, 1, + src_data + src_stride0 * select_indices_data[i], src_stride1, + output_data + output_stride0 * add_indices_data[i], 
output_stride1); + } +} + +static void make_bag_size(const Tensor &offsets, const Tensor &indices, + const int64_t mode, Tensor &bag_size) { + if (mode == MODE_MEAN || mode == MODE_MAX) { + // Compute this for MODE_MEAN and MODE_MAX (latter needed for backwards) + if (offsets.size(0) != 1) { + bag_size.slice(0, 0, bag_size.size(0) - 1, 1) = + offsets.slice(0, 1, offsets.size(0), 1) - + offsets.slice(0, 0, offsets.size(0) - 1, 1); + } + bag_size[-1] = indices.size(0) - offsets[-1]; + } +} + +static Tensor apply_bag_size(const Tensor &offsets, const Tensor &indices, + const int64_t mode, Tensor &output, + const Tensor &bag_size) { + if (mode == MODE_MEAN) { + if (offsets.size(0) == 1) { + auto bag_size_ = indices.size(0); + output /= bag_size_; + } else { + // Avoid dividing by 0 for empty bags. + // Instead we want empty bags to return all 0s + auto bag_size_ = at::max(bag_size, at::ones_like(bag_size)) + .toType(output.type()) + .unsqueeze(1) + .expand_as(output); + output /= bag_size_; + } + } + return output; +} + +static Tensor apply_bag_size_backward(const Tensor &offsets, + const Tensor &indices, const int64_t mode, + Tensor &output, const Tensor &offset2bag, + const Tensor &bag_size) { + if (mode == MODE_MEAN) { + if (offsets.size(0) == 1) { + auto bag_size_ = indices.size(0); + output /= bag_size_; + } else { + auto inv_bag_size_ = (1 / bag_size.toType(output.type())) + .unsqueeze(1) + .index_select(0, offset2bag); + output *= inv_bag_size_; + } + } + return output; +} + + +template +std::tuple embedding_bag_cpu_max( + const Tensor& weight, const Tensor &indices, const Tensor& offset2bag, const Tensor& output, const Tensor& bag_size, const Tensor& offsets) { + + auto max_indices = at::zeros({offsets.size(0), weight.size(1)}, indices.type()); + + int64_t numel = indices.numel(); + int64_t dims = weight.size(1); + auto indices_data = indices.data(); + auto offset2bag_data = offset2bag.data(); + + auto max_indices_data = max_indices.data(); + auto max_indices_stride = max_indices.stride(0); + + auto weight_data = weight.data(); + auto output_data = output.data(); + auto weight_stride0 = weight.stride(0); + auto weight_stride1 = weight.stride(1); + auto output_stride = output.stride(0); + + for (int i = 0; i < numel; i++) { + auto bag = offset2bag_data[i]; + auto word_idx = indices_data[i]; + + + for (int dim = 0; dim < dims; dim++) { + auto& current_item = output_data[output_stride * bag + dim]; + auto weight_item = weight_data[weight_stride0 * word_idx + dim * weight_stride1]; + + bool is_first_for_bag = (i == 0) || offset2bag_data[i - 1] != bag; + + if (is_first_for_bag || weight_item > current_item) { + current_item = weight_item; + max_indices_data[max_indices_stride * bag + dim] = word_idx; + } + } + } + + return std::tuple(output, offset2bag, bag_size, max_indices); +} + +// embedding_bag wrapper to enforce contiguity in tensors other than `weight`. +// This is created to save extra `.contiguous()` call in backward. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +embedding_bag(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + return at::_embedding_bag(weight, indices.contiguous(), offsets.contiguous(), + scale_grad_by_freq, mode, sparse); + }; + +// Assumes all input tensors except for `weight` are contiguous. 
+// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +_embedding_bag_cpu(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + auto weight_arg = TensorArg(weight, "weight", 1); + checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); + + auto bag_size = at::zeros(offsets.sizes(), indices.type()); + make_bag_size(offsets, indices, mode, bag_size); + + // If the last entries are empty, that the last offsets are irrelevant as they + // won't change anything in the assignment of ID -> bag, but index_add would + // throw out of bounds error. So to keep it simple we just add one more + // entry to the end then get rid of it after make_offset2bag. + auto offset2bag = at::zeros( + {indices.sizes()[0] + 1}, indices.options()); // offset2bag = [0 0 0 0 0] + + make_offset2bag(offsets, indices, offset2bag); + + offset2bag.resize_({indices.sizes()[0]}); + + auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); + + if (mode == MODE_MEAN || mode == MODE_SUM) { + if (weight.type().scalarType() == kFloat) { + index_select_add(indices, offset2bag, weight, output); + } else if (weight.type().scalarType() == kDouble) { + index_select_add(indices, offset2bag, weight, output); + } + auto ret = apply_bag_size(offsets, indices, mode, output, bag_size); + return std::tuple(ret, offset2bag, bag_size, bag_size); + } else { // MODE_MAX + return AT_DISPATCH_FLOATING_TYPES_AND_HALF( + weight.type(), "embedding_bag_cpu_max", [&]() { + return embedding_bag_cpu_max(weight, indices, offset2bag, output, bag_size, offsets); + } + ); + } +} + +// Assumes all input tensors are contiguous. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices, + const Tensor &offsets, + const Tensor &offset2bag, + const Tensor &bag_size_, + const Tensor &max_indices_, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode, + bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag", indices_arg, kLong); + checkContiguous("embedding_bag", indices_arg); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag", offsets_arg, kLong); + checkContiguous("embedding_bag", offsets_arg); + auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); + checkScalarType("embedding_bag", offset2bag_arg, kLong); + checkContiguous("embedding_bag", offset2bag_arg); + + if (sparse) { + return at::_embedding_bag_sparse_backward( + grad, indices, offsets, offset2bag, bag_size_, num_weights, + scale_grad_by_freq, mode); + } else { + return at::_embedding_bag_dense_backward( + grad, indices, offsets, offset2bag, bag_size_, max_indices_, num_weights, + scale_grad_by_freq, mode); + } +} + +Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indices_, + const Tensor &offsets_, + const Tensor &offset2bag__, + const Tensor &bag_size_, + const Tensor& max_indices_, int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices_, offsets_ and offset2bag__ are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward above. 
+ // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. + + auto grad = grad_.contiguous(); + auto grad_arg = TensorArg(grad, "grad_", 1); + checkScalarTypes("embedding_bag", grad_arg, {kFloat, kDouble}); + + Tensor &offset2bag_ = const_cast(offset2bag__); + + auto ind_sort_ = indices_.sort(); + auto indices = std::get<0>(ind_sort_); + auto ind_sort = std::get<1>(ind_sort_); + auto offset2bag = offset2bag_.index_select(0, ind_sort); + + auto indices_data = indices.data(); + auto offsets_data = offsets_.data(); + auto offset2bag_data = offset2bag.data(); + int64_t numel = indices.numel(); + + std::vector counts(num_weights); + for (int i = 0; i < numel; i++) { + counts[indices_data[i]] = 0; + } + for (int i = 0; i < numel; i++) { + counts[indices_data[i]]++; + } + + auto index_grad_weight = + at::zeros({num_weights, grad.size(1)}, grad.type()).contiguous(); + + std::vector counts_uniq; + counts_uniq.reserve(num_weights); + int64_t o = 0; + for (int64_t i = 0; i < numel; i += counts[indices_data[i]]) { + counts_uniq.push_back(counts[indices_data[i]]); + if (o > 0) { + counts_uniq[o] += counts_uniq[o - 1]; + } + o++; + } + + if (mode == MODE_MEAN || mode == MODE_SUM) { + #pragma omp parallel for if (numel > 1000) + for (int64_t i = 0; i < (int64_t)counts_uniq.size(); i++) { + int64_t start = i == 0 ? 0 : counts_uniq[i - 1]; + int64_t index = indices_data[start]; + for (int64_t j = start; j < counts_uniq[i]; j++) { + int64_t source = offset2bag_data[j]; + double scale = 1.0; + if (scale_grad_by_freq) { + scale /= counts[indices_data[i]]; + } + if (mode == 1) { // MODE_MEAN + if (offsets_.size(0) == 1) { + auto bag_size = indices.size(0); + scale /= bag_size; + } else { + if (source == offsets_.size(0) - 1) { + scale /= indices.size(0) - offsets_data[offsets_.size(0) - 1]; + } else { + scale /= offsets_data[source + 1] - offsets_data[source]; + } + } + } + int64_t ddim = grad.size(1); + if (grad.type().scalarType() == kFloat) { + auto igwd = index_grad_weight.data(); + auto gd = grad.data(); + THBlas_axpy(ddim, (float)scale, gd + ddim * source, 1, + igwd + ddim * index, 1); + } else if (grad.type().scalarType() == kDouble) { + auto igwd = index_grad_weight.data(); + auto gd = grad.data(); + THBlas_axpy(ddim, (double)scale, gd + ddim * source, 1, + igwd + ddim * index, 1); + } + } + } + } else if (mode == MODE_MAX) { + auto nonempty_max_indices = max_indices_.index_select(0, bag_size_.nonzero().view(-1)); + auto nonempty_grad = grad_.index_select(0, bag_size_.nonzero().view(-1)); + + for (int64_t dim = 0; dim < grad.size(1); dim++) { + index_grad_weight.select(1, dim).index_add_( + 0, nonempty_max_indices.select(1, dim), nonempty_grad.select(1, dim)); + } + } + + return index_grad_weight; +} + +Tensor _embedding_bag_sparse_backward( + const Tensor &grad_, const Tensor &indices, const Tensor &offsets, + const Tensor &offset2bag, const Tensor &bag_size_, int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices, offsets and offset2bag are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward above. + // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. 
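+  // Data-flow sketch (indices/offsets are illustrative only): with
+  // indices = [3, 1, 1, 4, 7] and offsets = [0, 2], offset2bag is [0, 0, 1, 1, 1],
+  // so index_select(0, offset2bag) below replicates each bag's gradient row once per
+  // contributing index; MODE_MEAN then scales rows by 1/bag_size (1/2, 1/2, 1/3, 1/3,
+  // 1/3 here) before embedding_backward scatters them into a sparse gradient of
+  // shape num_weights x embedding_dim.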
+ + Tensor grad = grad_; + Tensor index_grad = grad_.index_select(0, offset2bag); + index_grad = apply_bag_size_backward(offsets, indices, mode, index_grad, + offset2bag, bag_size_); + return native::embedding_backward(index_grad, indices, num_weights, -1, + scale_grad_by_freq, true); +} +} +} // namespace at::native diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp new file mode 100644 index 0000000..0e9a594 --- /dev/null +++ b/aten/src/ATen/native/Gesv.cpp @@ -0,0 +1,126 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/native/Gesv.h" + +#include "TH.h" // for USE_LAPACK + +#include + +#ifdef USE_LAPACK +extern "C" void dgesv_( + int* n, int* nrhs, double* a, int* lda, + int *ipiv, double* b, int* ldb, int* info); +extern "C" void sgesv_( + int* n, int* nrhs, float* a, int* lda, + int* ipiv, float* b, int* ldb, int* info); +#endif + +namespace at { namespace native { + +template +void lapackGesv( + int n, int nrhs, scalar_t* a, int lda, int* ipiv, + scalar_t* b, int ldb, int* info) { + AT_ERROR("gesv only takes float or double Tensors"); +} + +#ifdef USE_LAPACK +template<> void lapackGesv( + int n, int nrhs, float* a, int lda, int* ipiv, + float* b, int ldb, int* info) { + sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template<> void lapackGesv( + int n, int nrhs, double* a, int lda, int* ipiv, + double* b, int ldb, int* info) { + dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} +#endif + +template +static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +#ifndef USE_LAPACK + AT_ERROR("gesv: LAPACK library not found in compilation"); +#endif + auto A_data = A.data(); + auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + + auto batch_size = batchCount(A); + auto n = A.size(-2); + auto nrhs = b.size(-1); + + auto ipiv = at::empty({n}, b.type().toScalarType(kInt)); + + for (int64_t i = 0; i < batch_size; i++) { + int info; + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackGesv(n, nrhs, A_working_ptr, n, ipiv.data(), + b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; + } + } +} + +std::tuple _gesv_helper_cpu(const Tensor& self, const Tensor& A) { + std::vector infos(batchCount(A), 0); + auto A_working_copy = cloneBatchedColumnMajor(A); + auto b_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{ + applyGesv(b_working_copy, A_working_copy, infos); + }); + checkErrors(infos); + return std::tuple(b_working_copy, A_working_copy); +} + +// Supports arbitrary batch dimensions for self and A +std::tuple gesv(const Tensor& self, const Tensor& A) { + if (self.dim() <= 2 && A.dim() <= 2) { + // TODO: #7102: It's not necessary to have gesv (single) bindings for both + // TH and ATen. We should remove the TH gesv bindings, especially + // since the lapackGesv function is already in ATen. + return at::_gesv_single(self, A); + } + + checkInputs(self, A); + + // broadcast the batch dimensions of self and A. 
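+  // Broadcast example (shapes are illustrative only): for A of shape (4, 1, 3, 3)
+  // and b of shape (2, 3, 5), the batch dims (4, 1) and (2,) broadcast to (4, 2),
+  // so b expands to (4, 2, 3, 5) and A to (4, 2, 3, 3); the helper then solves the
+  // 4 * 2 = 8 linear systems independently.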
+ IntList self_batch_sizes(self.sizes().data(), self.ndimension() - 2); + IntList A_batch_sizes(A.sizes().data(), A.ndimension() - 2); + std::vector expand_batch_portion = + infer_size(self_batch_sizes, A_batch_sizes); + + std::vector self_expand_size({expand_batch_portion}); + self_expand_size.insert(self_expand_size.end(), + { self.size(-2), self.size(-1) }); + + std::vector A_expand_size({expand_batch_portion}); + A_expand_size.insert(A_expand_size.end(), + { A.size(-2), A.size(-1) }); + + Tensor self_broadcasted = self.expand(self_expand_size); + Tensor A_broadcasted = A.expand(A_expand_size); + return self.type()._gesv_helper(self_broadcasted, A_broadcasted); +} + +std::tuple gesv_out( + Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { + if (self.dim() > 2 || A.dim() > 2) { + AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " + "b.dim() (%lld) and A.dim() (%lld) must both be 2.", + (long long)self.dim(), (long long)A.dim()); + } + return at::_gesv_single_out(solution, lu, self, A); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h new file mode 100644 index 0000000..2d26552 --- /dev/null +++ b/aten/src/ATen/native/Gesv.h @@ -0,0 +1,32 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +static inline void checkInputs(const Tensor& self, const Tensor& A) { + if (A.size(-1) != A.size(-2)) { + AT_ERROR("A must be batches of square matrices, " + "but they are %lld by %lld matrices", + (long long)A.size(-1), (long long)A.size(-2)); + } + if (A.size(-1) != self.size(-2)) { + AT_ERROR("Incompatible matrix sizes for matmul: each A " + "matrix is %llu by %lld but each b matrix is %lld by %lld.", + (long long)A.size(-1), (long long)A.size(-1), + (long long)self.size(-2), (long long)self.size(-1)); + } +} + +static inline void checkErrors(std::vector infos) { + for (size_t i = 0; i < infos.size(); i++) { + auto info = infos[i]; + if (info < 0) { + AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", + (long long)i, -info); + } else if (info > 0) { + AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", + (long long)i, info, info); + } + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp new file mode 100644 index 0000000..9720adb --- /dev/null +++ b/aten/src/ATen/native/Indexing.cpp @@ -0,0 +1,327 @@ +// Indexing tensors by by tensors +// +// This corresponds to "advanced indexing" in NumPy. The two operations are: +// +// index(Tensor self, indices) -> Tensor +// index_put_(Tensor self, indices, value) +// +// The index is a TensorList containg kLong or kByte tensors or nulls. Byte +// tensors (boolean masks) are expanded to long tensors via nonzero(). Null +// tensors signify that the dimension is not indexed. +// +// All indexes are broadcast together and iterated as *one*. From NumPy: +// +// result[i_1, ..., i_M] == x[ind_1[i_1, ..., i_M], ind_2[i_1, ..., i_M], +// ..., ind_N[i_1, ..., i_M]] +// +// Note 1: ByteTensors expand to index as many dimensions as there are in the +// mask. +// +// Note 2: The behavior is more complicated when the index tensors are not all +// adjacent (e.g. x[[0, 1], :, [2, 3]]). 
In this case, self and the index +// tensors are transposed to the front: x.transpose(1, 2)[[0, 1], [2, 3]] + + +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/ExpandUtils.h" + +#include +#include +#include +#include + +namespace at { namespace native { + +[[noreturn]] +static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) { + std::stringstream ss; + ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; + ss << " does not match the shape of the indexed tensor " << self.sizes(); + ss << " at index " << idx; + throw std::runtime_error(ss.str()); +} + +static void checkIndexTensorTypes(TensorList indices) { + for (auto& tensor : indices) { + if (tensor.defined()) { + auto& type = tensor.type(); + auto scalarType = type.scalarType(); + if (scalarType != kLong && scalarType != kByte) { + throw std::runtime_error("tensors used as indices must be long or byte tensors"); + } + } + } +} + +static std::vector expandByteTensors(const Tensor & self, TensorList indices) { + // Expands byte tensors (masks) into the equivalent indexing by LongTensors + std::vector result; + for (auto & index : indices) { + if (index.type().scalarType() == kByte) { + // The sizes of the ByteTensor mask must match the sizes of the + // corresponding dimensions in self + for (int64_t j = 0; j < index.dim(); j++) { + int64_t srcIdx = result.size() + j; + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); +#ifndef USE_TH_SIZE_ZERO_DIM + auto special_empty = nonzero.numel() == 0; +#else + auto special_empty = false; +#endif + for (int64_t j = 0; j < index.dim(); j++) { + if (special_empty) { + // We can't call select on an empty tensor so we just create an empty + // tensor. + result.emplace_back(nonzero.type().tensor()); + } else { + result.emplace_back(nonzero.select(1, j)); + } + } + } else { + result.emplace_back(index); + } + } + return result; +} + +static bool hasContiguousSubspace(TensorList tl) { + // true if all the non-null tensors are adjacent + auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; + auto isNull = [](const Tensor & tensor){ return !tensor.defined(); }; + auto start = std::find_if(tl.begin(), tl.end(), isDefined); + auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined); + auto it = std::find_if(start, stop.base(), isNull); + return it == stop.base(); +} + +// Transposes the tensor and indices together so that all the non-null indices +// index the first k dimensions of the tensor. Returns the transposed tensor +// and the reordered indices. 
For example: +// transposeToFront(tensor, {nullptr, a, nullptr, b}) +// returns +// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr} +static std::tuple> +transposeToFront(Tensor self, TensorList indices) { + std::vector dims; + std::vector transposedIndices; + dims.reserve(self.dim()); + for (int64_t i = 0; i < self.dim(); i++) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (int64_t i = 0; i < self.dim(); i++) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices)); +} + +static std::vector computeLinearStride(const Tensor & tensor) { + // computes the stride as if tensor were contigous + auto sizes = tensor.sizes(); + std::vector stride(tensor.dim()); + stride[tensor.dim() - 1] = 1; + std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1, std::multiplies()); + return stride; +} + +// Unsqueezes src `before` times at the front and `after` times at the end +static Tensor unsqueezeN(const Tensor & src, int64_t before, int64_t after) { + auto srcSizes = src.sizes(); + auto nDim = src.dim(); + std::vector sizes(nDim + before + after, 1); + for (int64_t i = 0; i < nDim; i++) { + sizes[i + before] = srcSizes[i]; + } + return src.view(sizes); +} + +static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) { + if (index.numel() != 0) { + auto max_idx = index.max().toCLong(); + auto min_idx = index.min().toCLong(); + if (max_idx >= dim_size) { + AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + } + if (min_idx < -dim_size) { + AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + } + } + return index.remainder(dim_size); +} + +static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { + auto strides = computeLinearStride(src); + Type& longType = src.type().toScalarType(kLong); + + // Compute the linear index by multiplying the indexing tensors by the + // stride and summing them. All the indexing tensors have the same shape at + // this point. We also compute the number of dimensions before and after that + // are not being index. 
+ Tensor linearIndex; + int64_t emptyBefore = 0, emptyAfter = 0, nElemBefore = 1, nElemAfter = 1; + for (int64_t i = 0; i < src.dim(); i++) { + if (indices[i].defined()) { + // Cast index to the longType matching src's backend + // This allows us to support ie indexing a cuda tensor with a cpu tensor + Tensor index = (wrapIndexOnce(indices[i], i, src.size(i)) * strides[i]).toType(longType); + if (linearIndex.defined()) { + linearIndex += index; + } else { + linearIndex = index; + } + } else if (linearIndex.defined()) { + emptyAfter++; + nElemAfter *= src.size(i); + } else { + emptyBefore++; + nElemBefore *= src.size(i); + } + } + + // Compute the linear indices for the parts of the tensor not being indexed + Tensor beforeIndex; + if (emptyBefore > 0) { + auto index = at::arange(0, nElemBefore, longType) * strides[emptyBefore - 1]; + index = index.view(src.sizes().slice(0, emptyBefore)); + beforeIndex = unsqueezeN(index, 0, linearIndex.dim() + emptyAfter); + } + Tensor afterIndex; + if (emptyAfter > 0) { + auto index = at::arange(0, nElemAfter, longType); + index = index.view(src.sizes().slice(src.dim() - emptyAfter, emptyAfter)); + afterIndex = unsqueezeN(index, linearIndex.dim() + emptyBefore, 0); + } + + // Sum with broadcasting to compute the full index + linearIndex = unsqueezeN(linearIndex, emptyBefore, emptyAfter); + if (beforeIndex.defined()) { + linearIndex = linearIndex + beforeIndex; + } + if (afterIndex.defined()) { + linearIndex = linearIndex + afterIndex; + } + return linearIndex; +} + +#ifndef USE_TH_SIZE_ZERO_DIM +static bool hasEmptyTensor(TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.defined() && tensor.numel() == 0) { + return true; + } + } + return false; +} +#endif + +static std::tuple makeLinearIndex(Tensor self, TensorList orig) { + checkIndexTensorTypes(orig); + // first expand ByteTensor (boolean masks) into 1 or more LongTensors + auto indices = expandByteTensors(self, orig); +#ifndef USE_TH_SIZE_ZERO_DIM + if (hasEmptyTensor(indices)) { + return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); + } +#endif + // next broadcast all index tensors together + indices = expand_outplace(indices); + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices) = transposeToFront(self, indices); + } + auto linearIndex = computeLinearIndex(self, indices); + return std::make_tuple(self, linearIndex); +} + +Tensor index(const Tensor & self, TensorList indices) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + return src.take(linearIndex); +} + +Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + Tensor dst = src.clone(); + return dst.put_(linearIndex, expandedValue); +} + +Tensor & index_put_(Tensor & self, TensorList indices, const 
Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + return src.put_(linearIndex, expandedValue); +} + +Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + dim = maybe_wrap_dim(dim, self.dim()); + + if (index.dim() >= 2) { + AT_ERROR( + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + } + int64_t numIndices = index.numel(); + if (source.dim() == 0 && numIndices != 1) { + AT_ERROR( + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + } + if (index.type().scalarType() != ScalarType::Long) { + AT_ERROR("index_copy_(): Expected LongTensor for index"); + } + + // Check that source and destination slices have the same size + auto selfSlicedSizes = std::vector(self.sizes()); + if (selfSlicedSizes.size() > 0) { + selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); + } + auto sourceSlicedSizes = std::vector(source.sizes()); + if (sourceSlicedSizes.size() > 0) { + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); + } + if (selfSlicedSizes.size() != sourceSlicedSizes.size() || + !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), + sourceSlicedSizes.begin())) { + std::stringstream ss; + ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; + ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; + ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; + throw std::runtime_error(ss.str()); + } + if (source.dim() > 0 && numIndices != source.size(dim)) { + AT_ERROR( + "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); + } + + return self._indexCopy_(dim, index, source); +} + +}} // at::native diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp new file mode 100644 index 0000000..203bd5f --- /dev/null +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -0,0 +1,342 @@ +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + static bool _type_has_native(const Type& dtype) { + return dtype.is_sparse(); + } + + static bool _has_native(const Tensor& self) { + return _type_has_native(self.type()); + } +} + +// These native operations are not "really" native; they're actually just bridge +// functions that decide whether or not to call native sparse functions, or +// TH functions. This file should be temporary; when all of TH gets ported, we +// can just use the native mechanism straight. 
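+// Every bridge in this file follows the same shape (a schematic sketch, not an
+// operator added by this diff):
+//
+//   Tensor op(const Tensor& self, ...) {
+//     if (_has_native(self)) {         // today this just means self.is_sparse()
+//       return native_op(self, ...);   // ATen-native (sparse) kernel
+//     } else {
+//       return th_op(self, ...);       // legacy TH binding
+//     }
+//   }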
+ +// TODO: Maybe the foo_ variants should call th_foo_ + +Tensor norm(const Tensor & self, Scalar p) { + if (_has_native(self)) { + return native_norm(self, p); + } else { + return th_norm(self, p); + } +} + +Tensor clone(const Tensor& self) { + if (_has_native(self)) { + return native_clone(self); + } else { + return th_clone(self); + } +} + +Tensor& resize_as_(Tensor& self, const Tensor& the_template) { + if (_has_native(self)) { + return native_resize_as_(self, the_template); + } else { + return th_resize_as_(self, the_template); + } +} + +Tensor& pow_out(Tensor& result, const Tensor& self, Scalar exponent) { + if (_has_native(self)) { + return native_pow_out(result, self, exponent); + } else { + return th_pow_out(result, self, exponent); + } +} + +Tensor pow(const Tensor& self, Scalar exponent) { + if (_has_native(self)) { + return native_pow(self, exponent); + } else { + return th_pow(self, exponent); + } +} + +Tensor& zero_(Tensor& self) { + if (_has_native(self)) { + return native_zero_(self); + } else { + return th_zero_(self); + } +} + +// Note [Multiple dispatch to sparse] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// In an ideal world, we would use direct support for multiple dispatch to +// say that add(Dense, Dense) should dispatch to one function, while +// add(Dense, Sparse) should dispatch to another function. +// +// In a world where we only have single dispatch, we can single dispatch on +// the first function, and then do an is_sparse() test on the second argument +// to direct ourselves to the correct argument. +// +// We are in neither of those worlds. Instead, we have a th_add function +// which has legacy implementations in the single dispatch world, BUT our +// actual add function needs to call s_native_add if the function *would have* +// utilized a sparse kernel that is natively implemented. +// +// th_add is "good old single dispatch" which internally handles the is_sparse() +// test and also handles broadcasting. s_native_add works asymmetrically: +// it doesn't handle broadcasting at all, and it ASSUMES that the relevant +// argument is a sparse tensor. Why the asymmetry? It turns out it is not +// so easy to figure out if a kernel is implemented in THS; it's not as simple +// as testing if the first argument is sparse, because, e.g., +// in add(Dense, Sparse), the sparse kernel is in the second argument. So, +// the trampoline function is going to know about the overloads *anyway*; it +// might as well also handle is_sparse() and broadcasting while it's at it. +// +// Why not change TH to follow this new scheme? We could... but since it's +// all going away when we finish porting the TH functions to ATen, we haven't +// done it. + +Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add_out"); + return s_native_add_out(result, b_self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + // TODO: Perhaps doing overload selection with SparseTensorRef is + // confusing, and we should have given these overloads different names. + // For now, we do it this way for consistency with the TH bindings + // (not that it is terribly consistent anyway). 
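+    // Dispatch summary for add/add_/add_out (D = dense, S = sparse):
+    //   (S, S) -> broadcast, then s_native_add*              (sparse-sparse kernel)
+    //   (D, S) -> native_add*(self, SparseTensorRef(other))  (this branch, no broadcast)
+    //   (D, D), (S, D) -> th_add*                            (TH path, which does its
+    //                                                          own is_sparse handling)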
+ return native_add_out(result, self, SparseTensorRef(other), alpha); + } else { + return th_add_out(result, self, other, alpha); + } +} + +// NB: You may be tempted to implement add and add_ just as calls to add_out, but +// calling the actual implementing function matters, because broadcast +// will be handled differently depending on if you call add_ or (a seemingly +// equivalent) add_out. Arguably this mismatch in treatment is a bug, +// c.f., https://github.com/pytorch/pytorch/issues/8308 but fixing this +// bug would involve changing a lot of other places, so we leave it +// alone for now. + +Tensor add(const Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add"); + return s_native_add(b_self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + return native_add(self, SparseTensorRef(other), alpha); + } else { + return th_add(self, other, alpha); + } +} + +Tensor& add_(Tensor& self, const Tensor& other, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto self_sparse = self.is_sparse(); + auto other_sparse = other.is_sparse(); + if (self_sparse && other_sparse) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "add_"); + return s_native_add_(self, b_other, alpha); + } else if (!self_sparse && other_sparse) { + return native_add_(self, SparseTensorRef(other), alpha); + } else { + return th_add_(self, other, alpha); + } +} + + +Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "sub_out"); + return s_native_sub_out(result, b_self, b_other, alpha); + } else { + return th_sub_out(result, self, other, alpha); + } +} + +Tensor sub(const Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "sub"); + return s_native_sub(b_self, b_other, alpha); + } else { + return th_sub(self, other, alpha); + } +} + +Tensor& sub_(Tensor& self, const Tensor& other, Scalar alpha) { + if (_has_native(self)) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "sub_"); + return s_native_sub_(self, b_other, alpha); + } else { + return th_sub_(self, other, alpha); + } +} + + +Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "mul_out"); + return s_native_mul_out(result, self, other); + } else { + return th_mul_out(result, self, other); + } +} + +Tensor mul(const Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "mul"); + return s_native_mul(self, other); + } else { + return th_mul(self, other); + } +} + +Tensor& mul_(Tensor& self, const Tensor& other) { + if (_has_native(self)) { + Tensor b_other; + std::tie(b_other) = expand_inplace(self, other, "mul_"); + return s_native_mul_(self, b_other); + } else { + return th_mul_(self, other); + } +} + +Tensor& mul_out(Tensor& result, const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul_out(result, self, other); + } else { + return 
th_mul_out(result, self, other); + } +} + +Tensor mul(const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul(self, other); + } else { + return th_mul(self, other); + } +} + +Tensor& mul_(Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_mul_(self, other); + } else { + return th_mul_(self, other); + } +} + + +Tensor& div_out(Tensor& result, const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div_out(result, self, other); + } else { + return th_div_out(result, self, other); + } +} + +Tensor div(const Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div(self, other); + } else { + return th_div(self, other); + } +} + +Tensor& div_(Tensor& self, Scalar other) { + if (_has_native(self)) { + return native_div_(self, other); + } else { + return th_div_(self, other); + } +} + +Tensor& addmm_out(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + Tensor b_self; + std::tie(b_self) = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); + return s_native_addmm_out(result, b_self, mat1, mat2, beta, alpha); + } else { + return th_addmm_out(result, self, mat1, mat2, beta, alpha); + } +} + +Tensor addmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + Tensor b_self; + std::tie(b_self) = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); + return s_native_addmm(b_self, mat1, mat2, beta, alpha); + } else { + return th_addmm(self, mat1, mat2, beta, alpha); + } +} + +Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + // See Note [Multiple dispatch to sparse] + auto mat1_sparse = mat1.is_sparse(); + if (mat1_sparse) { + // inplace is not broadcasting + return s_native_addmm_(self, mat1, mat2, beta, alpha); + } else { + return th_addmm_(self, mat1, mat2, beta, alpha); + } +} + + +Tensor tensor(const Type& dtype) { + if (_type_has_native(dtype)) { + return dtype.native_tensor(); + } else { + return dtype.th_tensor(); + } +} + +Tensor tensor(const Type& dtype, ArrayRef size) { + if (_type_has_native(dtype)) { + return dtype.native_tensor(size); + } else { + return dtype.th_tensor(size); + } +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { + return values.type().toSparse().native_sparse_coo_tensor(indices, values); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { + return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); +} + +Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { + return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); +} + +int64_t get_device(const Tensor& self) { + if (_has_native(self)) { + return native_get_device(self); + } else { + return _th_get_device(self); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp new file mode 100644 index 0000000..cb24e71 --- /dev/null +++ b/aten/src/ATen/native/Linear.cpp @@ -0,0 +1,440 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtilsMulti.h" + +namespace at { namespace native { + + +// sumproduct_pair computes 
`(left*right).sum(sumdims)` by means of permutation and +// batch matrix multiplication +// its main purpose is to provide a pairwise reduction for einsum +static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntList sum_dims_, bool keepdim) { + // assumes that tensors have been pre-unsqueezed (so that all dimensions match - after broadcasting) + // but makes no other assumptions on the order of dimensions + AT_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); + if (sum_dims_.size() == 0) + return at::mul(left_, right_); + int64_t dim = left_.dim(); + auto sum_dims = dim_list_to_bitset(sum_dims_, dim); + // dimensions that will be part of the output (i.e. not summed over) in three vectors + // dims in lro appear in left, right and output, similarly lo: left and output, ro: right and output + // also the sizes are kept track of for reshaping + std::vector lro, lo, ro; + int64_t lro_size = 1, lo_size = 1, ro_size = 1, sum_size = 1; + Tensor left = left_; + Tensor right = right_; + for (int64_t i = 0; i < dim; i++) { + auto sl = left.size(i)>1; + auto sr = right.size(i)>1; + if (sum_dims[i]) { // first dimensions that will be summed over after multiplication + if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size + AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + sum_size *= left.size(i); + } else if (sl) { // if it is only in one of left and right, we can sum right away + left = left.sum(i, true); + } else if (sr) { + right = right.sum(i, true); + } + } else if (sl && sr) { // now deal with dimensions dimensions that will be in the output + // dimensions nontrivially in both left and right must be of the same size + AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + lro.push_back(i); + lro_size *= left.size(i); + } else if (sl) { // keep track of dimensions appearing only once + lo.push_back(i); + lo_size *= left.size(i); + } else { + ro.push_back(i); + ro_size *= right.size(i); + } + } + // we now work with the following permutations / shapes. 
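+  // A worked example of the classification above and the pipeline described below
+  // (illustrative only; the sizes are hypothetical): with pre-unsqueezed inputs
+  // left of size (B, I, J, 1) and right of size (B, 1, J, K) and sum_dims_ = {2}
+  // (the J dimension),
+  //   dim 0 -> lro (in left, right and output), dim 1 -> lo (left and output),
+  //   dim 2 -> summed (sum_size = J),           dim 3 -> ro (right and output),
+  // so left is reshaped to (B, I, J), right to (B, J, K), bmm produces (B, I, K),
+  // and the result is viewed as (B, I, 1, K) below.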
+  // the pipeline is permute inputs -> reshape inputs -> batch matrix mul -> reshape(view) output -> permute output
+  // output: "lro, lo, 1-for-summed-dims, ro" with original shape dimensions
+  // left: "lro, lo, summed" permuted with lpermutation and the three flattened
+  // right: "lro, summed, ro" permuted with rpermutation and the three flattened
+  // then the permuted output is a view of bmm(left, right)
+  // finally, opermutation reverts the permutation to the original order of dimensions
+  std::vector<int64_t> out_size;
+  for (auto& d : lro) out_size.push_back(left.size(d));
+  for (auto& d : lo) out_size.push_back(left.size(d));
+  for (auto& d : sum_dims_) { out_size.push_back(1); (void)(d); }; // avoid warning about not using d
+  for (auto& d : ro) out_size.push_back(right.size(d));
+
+  std::vector<int64_t> lpermutation(lro);
+  lpermutation.insert(lpermutation.end(), lo.begin(), lo.end());
+  lpermutation.insert(lpermutation.end(), sum_dims_.begin(), sum_dims_.end());
+  lpermutation.insert(lpermutation.end(), ro.begin(), ro.end());
+
+  std::vector<int64_t> rpermutation(lro);
+  rpermutation.insert(rpermutation.end(), sum_dims_.begin(), sum_dims_.end());
+  rpermutation.insert(rpermutation.end(), ro.begin(), ro.end());
+  rpermutation.insert(rpermutation.end(), lo.begin(), lo.end());
+
+  std::vector<int64_t> opermutation(lro.size()+lo.size()+sum_dims_.size()+ro.size(), -1);
+  {
+    int64_t i = 0;
+
+    for (auto it = lro.begin(); it != lro.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = lo.begin(); it != lo.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = sum_dims_.begin(); it != sum_dims_.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+    for (auto it = ro.begin(); it != ro.end(); i++, it++) {
+      opermutation[*it] = i;
+    }
+  }
+
+  // now we can execute the operations above
+  left = left.permute(lpermutation).reshape({lro_size, lo_size, sum_size});
+  right = right.permute(rpermutation).reshape({lro_size, sum_size, ro_size});
+  Tensor result = at::bmm(left, right);
+  result = result.view(out_size).permute(opermutation);
+
+  // finally squeeze summed dimensions if desired
+  if (! keepdim) {
+    for (int i = dim-1; i>=0; i--)
+      if (sum_dims[i])
+        result.squeeze_(i);
+  }
+  return result;
+}
+
+Tensor einsum(std::string eqn, TensorList tensors) {
+  constexpr size_t number_of_letters = 26;
+  std::string in_eqn;
+  size_t pos;
+  // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis.
+  // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter
+  // mapped to an index and the ellipsis ('...') being mapped to a number of consecutive indices.
+  // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that
+  // the letter has not been assigned an index yet (because it has not been seen).
+  // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices).
+  // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet.
+  // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below.
+
+  std::array<int64_t, number_of_letters> letter_mapping; // map letter to internal (numerical) label
+  letter_mapping.fill(-1);
+  int64_t num_ell_idxes = -1;
+  int64_t first_ell_idx = 0;
+
+  // The internal representation of the left hand side of the equation (with ellipsis expanded) is stored in input_op_idxes.
+  // For each operand, we have a vector mapping each dimension to an internal index.
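+  // For example (hypothetical input, for illustration): for eqn = "bij,bjk->bik",
+  // the letters b, i, j, k are assigned internal indices 0, 1, 2, 3 in order of first
+  // appearance, and input_op_idxes ends up as {{0, 1, 2}, {0, 2, 3}}.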
+ // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and + // of the last occurence of each index. + std::vector> input_op_idxes; // the parsed operand indices + std::array num_letter_occurrences; // number of occurrence in the equation of this letter + num_letter_occurrences.fill(0); + std::vector last_idx_occurrence; // the last operator (left to right) using this index + + if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side + in_eqn = eqn.substr(0, pos); + } else { + in_eqn = eqn; + } + + // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index + int64_t operand = 0; + std::stringstream eqn_stream(in_eqn); + std::string term; + int64_t num_total_idxes = 0; + while (! eqn_stream.eof()) { + std::getline(eqn_stream, term, ','); // term = string with indices of current term + AT_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + + int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' + // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions + int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; + int64_t dims_in_term = 0; // dimensions we have seen + std::vector current_op_idxes; // mapping of operand dimensions to indices for current term + for (auto &c : term) { // c = character with a single letter or '.' + if (c == '.') { + ell_char_count++; + AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + if (ell_char_count == 3) { // this completes the ellipsis + if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size + first_ell_idx = num_total_idxes; + num_ell_idxes = candidate_num_ell_idxes; + num_total_idxes += num_ell_idxes; + } + else { // we have seen an ellipsis before, so we check compatibility + AT_CHECK(candidate_num_ell_idxes == num_ell_idxes, + "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); + } + for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices + current_op_idxes.push_back(first_ell_idx + i); + last_idx_occurrence.push_back(operand); + } + dims_in_term += num_ell_idxes; // keep track of dimensions + } + } else { // a letter (hopefully) + AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis, operand ", operand); + AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; // letter_num = position in letter_mapping + if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping + letter_mapping[letter_num] = num_total_idxes; + num_total_idxes++; + last_idx_occurrence.push_back(operand); + } else { // letter we have already seen + last_idx_occurrence[letter_mapping[letter_num]] = operand; + } + num_letter_occurrences[letter_num]++; + current_op_idxes.push_back(letter_mapping[letter_num]); + dims_in_term++; + } + } + AT_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + input_op_idxes.push_back(std::move(current_op_idxes)); + operand++; + } + // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. + AT_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + + // the following parses or infers output (right hand side) + // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the output indices. -1 means that the index has not been assigned a dimension yet + std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions + int64_t num_output_dims = 0; + if (pos != std::string::npos) { // parse the user provided right hand side + int64_t ell_char_count = 0; + for (auto &c : eqn.substr(pos+2)) { + if (c == '.') { // '.' as part of ellipsis + ell_char_count++; + AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + if (ell_char_count == 3) { // ellipsis complete + AT_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + } else { // letter (hopefully) + AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); + AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; + AT_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); + idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; + num_output_dims++; + } + } + } else { // create an inferred right hand side + // the ellipsis (if in the lhs) comes first + if (num_ell_idxes >= 0) { + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + // then the indices that occur exactly once in alphabetic order + for (size_t idx = 0; idx < number_of_letters; idx++) { + if (num_letter_occurrences[idx] == 1) { + idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; + num_output_dims++; + } + } + } + // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the non-output indices - those that are eventually summed over + int64_t position = num_output_dims; + for (int64_t i = 0; i < num_total_idxes; i++) { + if (idxes_to_preprocessed_dims[i]==-1) { + idxes_to_preprocessed_dims[i] = position; + position++; + } + } + + // we now "homogenize the dimensions", i.e. 
+ // - take diagonals for duplicated indices + // - permute the dimensions to match the order given by idxes_to_preprocessed_dims + // - unsqueeze to create all dimensions for each index in each tensor where they are missing + // we also check that sizes match + // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) + std::vector preprocessed_operands; + std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet + for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { + auto preprocessed_op = tensors[op]; + std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear + std::vector& current_op_input_idxes = input_op_idxes[op]; + int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input + for (size_t i = 0; i < current_op_input_idxes.size(); i++) { + auto idx = current_op_input_idxes[i]; + auto dim_out = idxes_to_preprocessed_dims[idx]; + if (idx_to_dim[dim_out] == -1) { // first appearance + idx_to_dim[dim_out] = dim; + if (size_of_dims[idx] == -1) { // keep track of sizes + size_of_dims[idx] = preprocessed_op.size(dim); + } + else { + AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + } + dim++; + } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] + AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); + // diagonal moves the diagonal dimension to the back + // now we permute the last dim back to idx_to_dim[dim_out] + std::vector perm(preprocessed_op.dim(), 0); + for (int64_t d = 0; d < preprocessed_op.dim(); d++) { + if (d == idx_to_dim[dim_out]) { + perm[d] = preprocessed_op.dim() - 1; + } else { + perm[d] = d - (d > idx_to_dim[dim_out]); + } + } + preprocessed_op = preprocessed_op.permute(perm); + } + } + // now we permute the dimensions in the right order + std::vector permutation; // permutation for this tensor + for (auto &d : idx_to_dim) { + if (d > -1) { + permutation.push_back(d); + } + } + preprocessed_op = preprocessed_op.permute(permutation); + // finally, we insert dimensions for idxes not in the operand + for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { + if (idx_to_dim[dim] == -1) { + preprocessed_op = preprocessed_op.unsqueeze(dim); + } + } + preprocessed_operands.push_back(preprocessed_op); + } + + // now we reduce the indices from left to right + // numpy allows to optimize the path using various + // algorithms (see eigen_path in numpy docs) + // we start with the leftmost operator and reduce indices that + // appear only there + Tensor result = preprocessed_operands[0]; + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == 0) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + result = result.sum(idxes_to_preprocessed_dims[idx], true); + } + } + + // now we process each tensor using sumproduct_pair + for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + std::vector sum_dims; + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == i) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + 
sum_dims.push_back(idxes_to_preprocessed_dims[idx]);
+      }
+    }
+    result = at::native::sumproduct_pair(result, preprocessed_operands[i], sum_dims, true);
+  }
+  // finally, we squeeze out all non-result dimensions
+  for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--)
+    result.squeeze_(dim);
+  return result;
+}
+
+// _trilinear computes a trilinear Einstein sum with an unrolled dimension
+// the result is `(i1.unsqueeze(expand1)*i2.unsqueeze(expand2)*i3.unsqueeze(expand3)).sum(sumdim)`
+// the computation is unrolled in the unroll_dim dimension
+// its main purpose is to unify the computations in bilinear and bilinear_backward
+Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
+                  IntList expand1_, IntList expand2_, IntList expand3_,
+                  IntList sumdim_, int64_t unroll_dim) {
+  int64_t total_dim = i1_.dim()+expand1_.size();
+  AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]");
+  auto expand1 = dim_list_to_bitset(expand1_, total_dim);
+  auto expand2 = dim_list_to_bitset(expand2_, total_dim);
+  auto expand3 = dim_list_to_bitset(expand3_, total_dim);
+  auto sumdim = dim_list_to_bitset(sumdim_, total_dim);
+  Tensor i1 = i1_;
+  Tensor i2 = i2_;
+  Tensor i3 = i3_;
+  std::vector<int64_t> output_size;
+  std::vector<int64_t> sum_dims_12, sum_dims_23;
+  int64_t unroll_size = -1;
+  // asserts...
+  for (int64_t i = 0; i < total_dim; i++) {
+    int64_t s = 0;
+    if (expand1[i]) {
+      i1 = i1.unsqueeze(i);
+    } else {
+      s = i1.size(i);
+    }
+    if (expand2[i]) {
+      i2 = i2.unsqueeze(i);
+    } else {
+      s = i2.size(i);
+    }
+    if (expand3[i]) {
+      i3 = i3.unsqueeze(i);
+      if (sumdim[i] && (i != unroll_dim))
+        sum_dims_12.push_back(i);
+    } else {
+      s = i3.size(i);
+      if (sumdim[i] && (i != unroll_dim))
+        sum_dims_23.push_back(i);
+    }
+    output_size.push_back(sumdim[i] ? 1 : s);
+    if (i == unroll_dim)
+      unroll_size = s;
+  }
+  int64_t slicemul1 = (expand1[unroll_dim] ? 0 : 1);
+  int64_t slicemul2 = (expand2[unroll_dim] ? 0 : 1);
+  int64_t slicemul3 = (expand3[unroll_dim] ? 0 : 1);
+
+  auto output = i1.type().tensor(output_size).zero_();
+  if (!
sumdim[unroll_dim]) { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1), + i2.narrow(unroll_dim, k * slicemul2, 1), + sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k * slicemul3, 1), sum_dims_23, true); + output.narrow(unroll_dim, k, 1).add_(buf); + } + } + else { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k*slicemul1, 1), + i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k*slicemul3, 1), sum_dims_23, true); + output.add_(buf); + } + } + for (int64_t i = output.dim()-1; i >= 0; i--) + if (sumdim[i]) + output.squeeze_(i); + return output; +} + +Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const Tensor& bias) { + AT_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); + for (int64_t i = 0; i < input1.dim() - 1; i++) { + AT_CHECK(input1.size(i) == input2.size(i), + "bilinear(): input batch dimensions do not match at dim ", i, ": got ", input1.size(i), " and ", input2.size(i)); + } + AT_CHECK(input1.size(input1.dim() - 1) == weight.size(1), + "bilinear(): input1 size does not match weight size: got ", + input1.size(input1.dim() - 1), " but expected ", weight.size(1)); + AT_CHECK(input2.size(input2.dim() - 1) == weight.size(2), + "bilinear(): input2 size does not match weight size: got ", + input2.size(input2.dim() - 1), " but expected ", weight.size(2)); + AT_CHECK(!bias.defined() || bias.size(0) == weight.size(0), + "bilinear(): bias size does not match weight size: got ", + bias.size(0), " but expected ", weight.size(0)); + + std::vector output_size; + auto size1 = input1.sizes(); + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + output_size.push_back(weight.size(0)); + auto input1_flattened = input1.view({-1, input1.size(-1)}); + auto input2_flattened = input2.view({-1, input2.size(-1)}); + Tensor output = at::_trilinear(input1_flattened, weight, input2_flattened, {1,3}, {0}, {1,2}, {2,3}).reshape(output_size); + if (bias.defined()) { + output = output + bias; + } + return output; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp new file mode 100644 index 0000000..388d704 --- /dev/null +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -0,0 +1,319 @@ +#include "ATen/ATen.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include +#include +#include + +namespace at { +namespace native { + +// Helper function for det methods. +// For pivoted LU factorization A = P * L * U. Since we always have det(L) = 1, +// det(P) = \pm 1, this method returns a 3-tuple: +// (det(P), diag(U), info), +// where info helps us identify singular matrices. 
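+// Concretely, det(A) = det(P) * det(L) * det(U) = det(P) * prod(diag(U)), so the
+// callers below (det, logdet, slogdet) only need det(P) (which is +/-1) and diag(U);
+// e.g. det() returns diag_U.prod().mul_(det_P) when the factorization succeeds.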
+static inline std::tuple _lu_det_P_diag_U_info(const Tensor& self) { + Tensor p, lu, info; + std::tie(lu, p, info) = self.unsqueeze(0).btrifact_with_info(); + p.squeeze_(0); + lu.squeeze_(0); + int int_info = info.squeeze_().toCInt(); + AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); + auto n = self.size(0); + auto num_exchanges = (at::arange(1, n + 1, p.type()) != p).nonzero().size(0); + if (num_exchanges % 2 == 1) { + return std::make_tuple(-1., lu.diag(), int_info); + } else { + return std::make_tuple(1., lu.diag(), int_info); + } +} + +Tensor det(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "det(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + return at::zeros({}, self.type()); + } else { + return diag_U.prod().mul_(det_P); + } +} + +Tensor logdet(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "logdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U, det; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + det = at::zeros({}, self.type()); + } else { + det = diag_U.prod().mul_(det_P); + } + if (det.sign().toCDouble() <= 0) { + return det.log_(); // in order to get proper -inf (det=0) or nan (det<0) + } else { + return diag_U.abs().log().sum(); + } +} + +std::tuple slogdet(const Tensor& self) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && + self.dim() == 2 && self.size(0) == self.size(1), + "slogdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " + "of floating types"); + double det_P; + Tensor diag_U, det; + int info; + std::tie(det_P, diag_U, info) = _lu_det_P_diag_U_info(self); + if (info > 0) { + det = at::zeros({}, self.type()); + } else { + det = diag_U.prod().mul_(det_P); + } + return std::make_tuple(det.sign(), diag_U.abs_().log_().sum()); +} + +Tensor inverse(const Tensor& self) { + Tensor result = self.type().tensor(); + return at::native::inverse_out(result, self); +} + +Tensor& inverse_out(Tensor &result, const Tensor &self) { + AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + "tensor should have CPU or CUDA backend"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "tensor should be of floating-point type"); + if (self.size(0) == 0) { + return result.resize_({0, 0}); + } else { + return at::_getri_out(result, self); + } +} + +Tensor pinverse(const Tensor& self, double rcond) { + AT_CHECK(at::isFloatingType(self.type().scalarType()) && self.dim() == 2, + "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " + "of floating types"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + if (self.numel() == 0) { + // Match NumPy + return self.type().tensor({self.size(1), self.size(0)}); + } + Tensor U, S, V; + std::tie(U, S, V) = self.svd(); + double max_val = S[0].toCDouble(); + Tensor S_pseudoinv = at::where(S > rcond * max_val, S.reciprocal(), at::zeros({}, self.options())); + return V.mm(S_pseudoinv.diag().mm(U.t())); +} + +static void check_1d(const 
Tensor& t, const char* arg, const char* fn) { + AT_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); +} + +Tensor ger(const Tensor& self, const Tensor& vec2) { + check_1d(self, "self", "ger"); + check_1d(vec2, "vec2", "ger"); + return at::_ger(self, vec2); +} + +Tensor& ger_out(Tensor& result, const Tensor& self, const Tensor& vec2) { + check_1d(self, "self", "ger"); + check_1d(vec2, "vec2", "ger"); + return at::_ger_out(result, self, vec2); +} + +Tensor mm(const Tensor& self, const Tensor& mat2) { + if (self.is_sparse()) { + return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); + } + return self.type()._mm(self, mat2); +} + +Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { + if (self.is_sparse()) { + return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + } + return self.type()._mm_out(result, self, mat2); +} + +Tensor mv(const Tensor& self, const Tensor& vec) { + check_1d(vec, "vec", "mv"); + return at::_mv(self, vec); +} + +Tensor& mv_out(Tensor& result, const Tensor& self, const Tensor& vec) { + check_1d(vec, "vec", "mv"); + return at::_mv_out(result, self, vec); +} + +Tensor addmv(const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return at::_addmv(self, mat, vec, beta, alpha); +} + +Tensor& addmv_(Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return self._addmv_(mat, vec, beta, alpha); +} + +Tensor& addmv_out(Tensor &result, const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { + check_1d(vec, "vec", "addmv"); + return at::_addmv_out(result, self, mat, vec, beta, alpha); +} + +Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return at::_addr(self, vec1, vec2, beta, alpha); +} + +Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return self._addr_(vec1, vec2, beta, alpha); +} + +Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { + check_1d(vec1, "vec1", "addr"); + check_1d(vec2, "vec2", "addr"); + return at::_addr_out(result, self, vec1, vec2, beta, alpha); +} + +Tensor dot(const Tensor& self, const Tensor& tensor) { + check_1d(self, "self", "dot"); + check_1d(tensor, "tensor", "dot"); + return self._dot(tensor); +} + +Tensor& dot_out(Tensor& result, const Tensor& self, const Tensor& tensor) { + result.resize_({}); + // dispatching through type ensures we don't allow mismatched types. + return self.type().fill_(result, self.dot(tensor)); +} + +/* +Matrix product of two Tensors. +The behavior depends on the dimensionality of the Tensors as follows: +- If both Tensors are 1-dimensional, the dot product (scalar) is returned. +- If both arguments are 2-dimensional, the matrix-matrix product is returned. +- If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. +- If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. 
+- If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are broadcasted (and thus + must be broadcastable). For example, if tensor1 is a (j x 1 x n x m) Tensor + and tensor2 is a (k x m x p) Tensor, the returned tensor will be an (j x k x n x p) Tensor. +*/ +Tensor matmul(at::optional out_opt, const Tensor& tensor1, const Tensor& tensor2) { + auto dim_tensor1 = tensor1.dim(); + auto dim_tensor2 = tensor2.dim(); + auto has_out = out_opt.has_value(); + Tensor out = out_opt.value_or(Tensor()); + + if (dim_tensor1 == 1 && dim_tensor2 == 1) { + return has_out ? at::native::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); + } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { + return has_out ? at::native::mv_out(out, tensor1, tensor2) : tensor1.mv(tensor2); + } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { + return has_out ? at::native::mm_out(out, tensor1.unsqueeze(0), tensor2).squeeze_(0) + : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); + } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { + return has_out ? at::native::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); + } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { + // optimization: use mm instead of bmm by folding tensor1's batch into + // its leading matrix dimension. + + Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; + auto size1 = tensor1.sizes(); + auto size2 = t2.sizes(); + std::vector output_size; + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + if (dim_tensor2 > 1) { + output_size.push_back(size2[dim_tensor2 - 1]); + } + + // fold the batch into the first dimension + Tensor t1 = tensor1.contiguous().view({-1, size1[size1.size() - 1]}); + Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) + : at::_unsafe_view(t1.mm(t2), output_size); + return has_out ? out.set_(output) : output; + } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { + // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); + // we track m1 vs m2 separately even though they must match for nicer error messages + int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1; + int64_t m1 = tensor1.size(-1); + IntList batch_tensor1(tensor1.sizes().data(), std::max(dim_tensor1 - 2, 0)); + int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1; + int64_t p = tensor2.size(-1); + IntList batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); + + // expand the batch portion (i.e. 
cut off matrix dimensions and expand rest) + std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); + + std::vector tensor1_expand_size(expand_batch_portion); + tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); + + std::vector tensor2_expand_size(expand_batch_portion); + tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); + + int expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(), + 1, std::multiplies()); + + std::vector tensor1_bmm_view({expand_batch_product}); + tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); + + std::vector tensor2_bmm_view({expand_batch_product}); + tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p}); + + // flatten expanded batches + Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).contiguous().view(tensor1_bmm_view); + Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).contiguous().view(tensor2_bmm_view); + + // reshape batches back into result + std::vector output_shape(expand_batch_portion); + if (dim_tensor1 > 1) { + output_shape.push_back(n); + } + if (dim_tensor2 > 1) { + output_shape.push_back(p); + } + + Tensor output = has_out ? at::_unsafe_view(at::bmm_out(out, tensor1_expanded, tensor2_expanded), output_shape) + : at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); + + return has_out ? out.set_(output) : output; + } + + AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", + dim_tensor1, "D and ", dim_tensor2, "D"); + +} + +Tensor matmul(const Tensor & tensor1, const Tensor & tensor2) { + return at::native::matmul(at::nullopt, tensor1, tensor2); +} + +Tensor& matmul_out(Tensor &result, const Tensor & tensor1, const Tensor & tensor2) { + at::native::matmul(at::optional(result), tensor1, tensor2); + return result; +} + +} +} diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h new file mode 100644 index 0000000..d7b9a6d --- /dev/null +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -0,0 +1,42 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +/* + * Clones a Tensor so that the following conditions hold: + * If we think of a Tensor of having size (B, M, N), where B is any number + * of batch dimensions, then: + * - Each (M, N) matrix is in column major form + * - Let Tensor P have size (B, M, N) and Q have size (B, M', N'). + * Then when laid out in memory, the M by N matrix starting at + * P.data_ptr()[b * M * N] is of the same corresponding batch as the M' by N' + * matrix starting at Q.data_ptr()[b * M' * N']. + */ +static inline Tensor cloneBatchedColumnMajor(const Tensor& src) { + // If src is already in batched column major format, then + // this will be efficient (no reordering of the data will occur) + // because the first transpose will make the tensor contiguous, + // and cloning a contiguous tensor is fast. + auto result = src.transpose(-2, -1).clone(); + result.transpose_(-2, -1); + return result; +} + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches. 
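+ * For example (hypothetical shape): a tensor of size (3, 5, M, N) holds 3 * 5 = 15
+ * matrices of size M x N, so batchCount returns 15 and matrixStride (below)
+ * returns M * N.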
+ */ +static inline int64_t batchCount(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + result *= batched_matrices.size(i); + } + return result; +} + +// Computes the number of elements of a matrix in a batched matrix tensor +static inline int64_t matrixStride(const Tensor& batched_matrices) { + return batched_matrices.size(-1) * batched_matrices.size(-2); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp new file mode 100644 index 0000000..c370cb4 --- /dev/null +++ b/aten/src/ATen/native/Loss.cpp @@ -0,0 +1,74 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +#define EPSILON 1e-12 + + +namespace at { namespace native { + +Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { + auto prod_sum = (input1 * input2).sum(1); + auto mag_square1 = (input1 * input1).sum(1) + EPSILON; + auto mag_square2 = (input2 * input2).sum(1) + EPSILON; + auto denom = (mag_square1 * mag_square2).sqrt_(); + auto cos = prod_sum / denom; + + auto zeros = at::zeros_like(target); + auto pos = 1 - cos; + auto neg = (cos - margin).clamp_min_(0); + auto output_pos = at::where(target == 1, pos, zeros); + auto output_neg = at::where(target == -1, neg, zeros); + auto output = output_pos + output_neg; + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / target.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor hinge_embedding_loss(const Tensor& self, const Tensor& target, double margin, int64_t reduction) { + auto zeros = at::zeros_like(self); + auto margin_clamp = (margin - self).clamp_min_(0); + auto output_margin = at::where(target != 1, margin_clamp, zeros); + auto output_self = at::where(target != -1, self, zeros); + auto output = output_margin + output_self; + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / self.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor triplet_margin_loss(const Tensor& anchor, const Tensor& positive, const Tensor& negative, double margin, + double p, double eps, bool swap, int64_t reduction) { + auto dist_pos = at::pairwise_distance(anchor, positive, p, eps); + auto dist_neg = at::pairwise_distance(anchor, negative, p, eps); + if (swap) { + auto dist_swap = at::pairwise_distance(positive, negative, p, eps); + dist_neg = at::min(dist_neg, dist_swap); + } + auto output = at::clamp_min(margin + dist_pos - dist_neg, 0); + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / output.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} + +Tensor margin_ranking_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { + auto output = (-target * (input1 - input2) + margin).clamp_min_(0); + + if (reduction == Reduction::ElementwiseMean) { + return output.sum() / output.numel(); + } else if (reduction == Reduction::Sum) { + return output.sum(); + } + return output; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp new file mode 100644 index 0000000..dfb7e62 --- /dev/null +++ b/aten/src/ATen/native/Memory.cpp @@ -0,0 +1,20 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include 
"ATen/detail/CUDAHooksInterface.h" + +namespace at { +namespace native { + +Tensor pin_memory(const Tensor& self) { + if (self.type().backend() != kCPU) { + AT_ERROR("cannot pin '", self.type().toString(), "' only CPU memory can be pinned"); + } + auto* allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); + auto tensor = self.type().tensorWithAllocator(self.sizes(), self.strides(), allocator); + tensor.copy_(self); + return tensor; +} + +} +} diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp new file mode 100644 index 0000000..ded0082 --- /dev/null +++ b/aten/src/ATen/native/Normalization.cpp @@ -0,0 +1,193 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/Config.h" + +#include "ATen/detail/CUDAHooksInterface.h" + +#include + +namespace at { namespace native { + +namespace { + void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ + if (actual != expected){ + std::stringstream ss; + ss << arg_name << " should contain " << expected << " elements not " << actual ; + throw std::runtime_error(ss.str()); + } + } +} + +Tensor batch_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool training, double momentum, double eps, bool cudnn_enabled) { + + auto num_features = input.sizes()[1]; + if (running_mean.defined()) { + check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); + } else if (!training) { + throw std::runtime_error("running_mean must be defined in evaluation mode"); + } + if (running_var.defined()) { + check_dims_match_num_input_features("running_var", num_features, running_var.numel()); + } else if (!training) { + throw std::runtime_error("running_var must be defined in evaluation mode"); + } + if (weight.defined()) { + check_dims_match_num_input_features("weight", num_features, weight.numel()); + } + if (bias.defined()) { + check_dims_match_num_input_features("bias", num_features, bias.numel()); + } + + bool use_cudnn = false; + use_cudnn = (input.type().is_cuda() + && (input.type().scalarType() != at::kHalf + || weight.type().scalarType() == at::kFloat) + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && input.size(0) <= 131070 + && detail::getCUDAHooks().compiledWithCuDNN() + && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L); + + if (use_cudnn && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN()) { + return std::get<0>(at::cudnn_batch_norm( + input.contiguous(), weight.contiguous(), + bias.contiguous(), + running_mean.defined() ? running_mean.contiguous() : running_mean, + running_var.defined() ? 
running_var.contiguous() : running_var, + training, momentum, eps)); + } + + return at::thnn_batch_norm( + input.contiguous(), weight, bias, + running_mean, running_var, training, momentum, eps); +} + +Tensor layer_norm(const Tensor& input, IntList normalized_shape, + const Tensor& weight /* optional */, const Tensor& bias /* optional */, + double eps, bool cudnn_enabled) { + + int64_t normalized_ndim = normalized_shape.size(); + + if (normalized_ndim < 1) { + std::stringstream ss; + ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " + << "containing at least one element, but got normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + if (weight.defined() && !weight.sizes().equals(normalized_shape)) { + std::stringstream ss; + ss << "Expected weight to be of same shape as normalized_shape, but got " + << "weight of shape " << weight.sizes() << " and normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + if (bias.defined() && !bias.sizes().equals(normalized_shape)) { + std::stringstream ss; + ss << "Expected bias to be of same shape as normalized_shape, but got " + << "bias of shape " << bias.sizes() << " and normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + auto input_shape = input.sizes(); + auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + throw std::runtime_error(ss.str()); + } + + int64_t n = 1; + for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) { + n *= input_shape[i]; + } + + // Apply layer norm + auto input_reshaped = input.contiguous().view({1, n, -1}); + + auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, + cudnn_enabled); + out = out.view(input_shape); + + if (weight.defined() && bias.defined()) { + return bias.addcmul(out, weight, 1); + } else if (weight.defined()) { + return out.mul(weight); + } else if (bias.defined()) { + return out.add(bias); + } else { + return out; + } +} + +Tensor group_norm(const Tensor& input, int64_t num_groups, + const Tensor& weight /* optional */, const Tensor& bias /* optional */, + double eps, bool cudnn_enabled) { + + auto input_shape = input.sizes(); + int64_t b = input.size(0); + int64_t c = input.size(1); + + if (c % num_groups != 0) { + std::stringstream ss; + ss << "Expected number of channels in input to be divisible by " + << "num_groups, but got input of shape " << input.sizes() << " and " + << "num_groups=" << num_groups; + throw std::runtime_error(ss.str()); + } + + if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { + std::stringstream ss; + ss << "Expected weight to be a vector of size equal to the number of " + << "channels in input, but got weight of shape " << weight.sizes() + << " and input of shape " << input.sizes(); + throw std::runtime_error(ss.str()); + } + + if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { + std::stringstream ss; + ss << "Expected bias to be a vector of size equal to the number of " + << "channels in input, but got bias of shape " << weight.sizes() + << " and input of shape " << input.sizes(); + throw std::runtime_error(ss.str()); + } + + // Apply group norm + auto input_reshaped = 
input.contiguous().view({1, b * num_groups, -1}); + + auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, + cudnn_enabled); + out = out.view(input_shape); + + if (!weight.defined() && !bias.defined()) { + return out; + } + + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = c; + + if (weight.defined() && bias.defined()) { + return bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + return out.mul(weight.view(affine_param_shape)); + } else { + return out.add(bias.view(affine_param_shape)); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp new file mode 100644 index 0000000..a13cae0 --- /dev/null +++ b/aten/src/ATen/native/Pooling.cpp @@ -0,0 +1,134 @@ +#include "ATen/ATen.h" + +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include "ATen/TensorUtils.h" + +#include + +namespace at { namespace native { + +static void check1d( + const char* function_name, + const char* argument_name, + IntList x) { + AT_CHECK( + x.size() == 1, + function_name, "() argument '", argument_name, + "' should contain one int (got ", x.size(), ")"); +} + +Tensor adaptive_avg_pool1d(const Tensor & self, IntList output_size) { + checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); + check1d("adaptive_avg_pool1d", "output_size", output_size); + + auto output = at::adaptive_avg_pool2d( + self.unsqueeze(2), + {1, output_size[0]}); + + return output.squeeze(2); +} + +std::tuple adaptive_max_pool1d(const Tensor & self, IntList output_size) { + checkDim("adaptive_max_pool1d", TensorArg(self, "self", 1), 3); + check1d("adaptive_max_pool1d", "output_size", output_size); + + Tensor output, indices; + std::tie(output, indices) = at::adaptive_max_pool2d( + self.unsqueeze(2), + {1, output_size[0]}); + + return std::make_tuple(output.squeeze(2), indices.squeeze(2)); +} + +std::tuple max_pool1d_with_indices( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + if (stride.empty()) { + stride = kernel_size; + } + checkDim("max_pool1d", TensorArg(self, "self", 1), 3); + check1d("max_pool1d", "kernel_size", kernel_size); + check1d("max_pool1d", "stride", stride); + check1d("max_pool1d", "padding", padding); + check1d("max_pool1d", "dilation", dilation); + + Tensor output, indices; + std::tie(output, indices) = at::max_pool2d_with_indices( + self.unsqueeze(2), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + {1, dilation[0]}, + ceil_mode); + + return std::make_tuple(output.squeeze(2), indices.squeeze(2)); +} + +Tensor avg_pool1d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + bool ceil_mode, + bool count_include_pad) { + if (stride.empty()) { + stride = kernel_size; + } + checkDim("avg_pool1d", TensorArg(self, "self", 1), 3); + check1d("avg_pool1d", "kernel_size", kernel_size); + check1d("avg_pool1d", "stride", stride); + check1d("avg_pool1d", "padding", padding); + + auto output = at::avg_pool2d( + self.unsqueeze(2), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + ceil_mode, + count_include_pad); + + return output.squeeze(2); +} + +Tensor max_pool1d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool1d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return 
std::get<0>(output_and_indices); +} + +Tensor max_pool2d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool2d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return std::get<0>(output_and_indices); +} + +Tensor max_pool3d( + const Tensor& self, + IntList kernel_size, + IntList stride, + IntList padding, + IntList dilation, + bool ceil_mode) { + auto output_and_indices = at::max_pool3d_with_indices( + self, kernel_size, stride, padding, dilation, ceil_mode); + return std::get<0>(output_and_indices); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md new file mode 100644 index 0000000..d4ad799 --- /dev/null +++ b/aten/src/ATen/native/README.md @@ -0,0 +1,310 @@ +ATen "native" functions are the modern mechanism for adding operators and +functions to ATen (they are "native" in contrast to legacy functions, which are bound +via TH/THC cwrap metadata). Native functions +are declared in `native_functions.yaml` and have implementations defined +in one of the `cpp` files in this directory. + +Like all ATen methods/functions, native functions are made available +from both ATen's C++ and Python APIs. In C++, they are made available +either as methods on `Tensor` (`t.mymeth()`) and functions in the ATen +namespace (`at::myfunc()`). In PyTorch, they are made available as +methods on `Variable` or as functions on `torch._C._FunctionBase` +(it is the user's responsibility to re-exporting these functions in +a more user-facing module.) At the moment, only +functions which ingest `Variable` are made available; to use a function +with non-differentiable tensors, wrap your tensors with `Variable` before +passing them in. + +The rest of this document describes how to implement an ATen function. + +## Registering a function in `native_functions.yaml` + +Every native function must have an entry in +`native_functions.yaml`. The format can be summarized as: + +``` +- func: func_name(ArgType arg0[=default], ArgType arg1[=default], ...) -> ReturnType + variants: function, method + dispatch: + CPU: func_cpu + CUDA: func_cuda +``` + +Each component is described in more detail below: + +### `func` + +``` +- func: func_name(ArgType arg0[=default], ArgType arg1[=default], ...) -> ReturnType +``` + +The `func` entry is a string describing the name of the function and its type +signature. + +**Argument types.** These types are permissible as ArgType: + +- `Tensor`. A `Tensor` argument translates into a C++ argument of type `const Tensor&` + (except when the argument is "inplace"; in this case, it is simply `Tensor&`). + A trailing `?`, as in `Tensor?`, indicates that the tensor argument is optional + and may be omitted by passing an undefined tensor. When a function takes multiple + `Tensor` arguments, these tensors are assumed to be the same type (e.g., + if one argument is a `FloatTensor`, all other arguments are checked + to be `FloatTensor`s.) +- Tensors of specific types. At the moment, valid type names are: + - `IntegerTensor` (a.k.a. `LongTensor`) + - `BoolTensor` (a.k.a. `ByteTensor`) + - `IndexTensor` (a.k.a. `IntTensor`) + These type names were inherited from TH, and may be renamed soon, so + don't commit them to memory. +- `TensorList`. A `TensorList` argument translates into a C++ argument of type `ArrayRef` + (a.k.a. `TensorList`) +- `IntList`. 
+  `IntList` accepts an optional length specifier, e.g., `IntList[2]`, which
+  has no effect in C++ but extends our Python bindings to accept a bare number, which will be
+  expanded into an appropriately sized list by repeating the number.
+- `int64_t`. There is no `int`; ATen policy is to use `int64_t` in the API anywhere you would
+  have ordinarily passed an `int` or `size_t`.
+- `double`. There is no `float`; ATen policy is to use `double` anywhere you would have used `float`.
+- `bool`
+- `Scalar`. `Scalar` supports binding to any numerical type from Python, including integral types,
+  floating point types, and zero dimensional tensors. `int64_t` and `double` can only bind to the
+  corresponding Python numerical types. However, you probably don't want to use `Scalar`; it is
+  really used for binding to "real" types in TH/THC code, where the Python APIs you are binding to are
+  actually different types. `double` and `int64_t` argument types should suffice for most algorithms.
+- `Generator*`, the state for a random number generator.
+- `std::array<bool,N>` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise
+  this argument will not parse correctly. (If you decide to fix this, make sure you fix the
+  argument parser both in ATen and in PyTorch.)
+- `*` is a special sentinel argument, which doesn't translate into an actual
+  argument, but indicates that in the Python bindings, any subsequent arguments
+  must be specified as keyword arguments (and cannot be provided positionally).
+
+**Return types.** These types are permissible as ReturnType:
+
+- `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector<Tensor>`,
+  respectively (unless the operation is in-place, in which case the return type
+  is `Tensor&`).
+- A tuple of any number of `Tensor`s, e.g., `(Tensor, Tensor)`, translating into
+  the corresponding C++ `std::tuple`, e.g., `std::tuple<Tensor, Tensor>`.
+
+If you need a type that is not listed here, it may be possible to extend ATen's
+code generation to support it. ATen's philosophy is to support
+only simple, universal types, plus a handful of fundamental Tensor structures
+(e.g., `Tensor` and `Generator*`), because these types can be easily ported to any language
+bound to ATen (in practice, C++ and Python).
+
+**Argument names.** Argument names are meaningful; downstream binding code may make use of the specific
+argument name you provide, and renaming an argument is considered a BC-breaking
+change (e.g., you will probably need to update `tools/autograd/derivatives.yaml` at
+least). In `native_functions.yaml`, if your function (usually a function with an `out` affix) takes
+the result Tensor among its arguments, you must name that argument `Tensor result`. If there is more
+than one result Tensor, name them `Tensor result0, Tensor result1, ...`.
+
+TODO: Do argument names affect Python keyword arguments?
+
+**Defaults.** Any suffix of arguments can have a default value defined;
+these default values translate into C++/Python default values which
+are applied when those positional arguments are not specified.
+
+Here are the supported default values:
+
+* Numbers (e.g., `0` or `5.0`) for `int64_t`, `double` and `IntList`
+  with an explicit length (e.g., `IntList[2]`); in the case of `IntList`,
+  a number is replicated to fill the length (e.g., `IntList[2] x=2`
+  is equivalent to `IntList[2] x={2,2}`).
+* Lists of numbers (e.g., `{0, 0}`) for `IntList`.
+* Booleans (e.g., `true`) for `bool`.
+* Empty initializer lists (e.g., `{}`) for `Tensor` (this implicitly changes + a `Tensor` argument to accept undefined tensors). +* `nullptr` for pointer types (e.g., `Generator*`) + +The declarations also support the following attributes: + +### `variants` + +``` +variants: function, method +``` + +Controls whether Tensor method (`t.foo()`) or namespace Function (`at::foo()`) is +generated as a result of this declaration. If the declaration is a method, +you must have an argument `Tensor self` at some position in the method; +in the method variant this argument will be elided from the argument +list. For example, given the declaration `where(BoolTensor cond, Tensor self, Tensor other)`, +this generates the function `at::where(cond, self, other)` and the method +`self.where(cond, other)`. + +By default, ATen generates both function and method variants for a native function. +Generally, the function variant is always useful; however, you may not wish +to generate a method variant. Tensor operations as methods are appropriate for "core" +Tensor operations (e.g., add, sub, etc.), but not for more complicated neural network +layers (e.g., `conv2d`) and internal functions designed specifically for binding +(e.g., `cudnn_convolution`). + +### `dispatch` + +``` +dispatch: + CPU: func_cpu + CUDA: func_cuda +``` + +This specifies the actual name of the function you want to dispatch to, so you +can dispatch to different functions depending on whether or not you have CPU or +CUDA tensors. Technically, it is also possible to write `dispatch: func_name` +to unconditionally dispatch to a native function whose name is different than +the name in the public ATen API, but this is generally frowned upon (just name +them the same thing!) + +### `python_default_init` + +``` +python_default_init: + argument_name: initializing_expression +``` + +A map from argument names to default initializing expressions written in C++. Such default +expressions will only be used in Python API (in the C++ API, these arguments are +mandatory). + +There are a few situations where you might like to use this functionality: + +- You want a default value which is fine in Python but would cause ambiguity in C++. + For example, `norm(Tensor self, real p=2, int64_t dim=1)` would cause ambiguity + with long tensors in C++. Therefore, we need to make `p=2` a python only default + initialization value. + +- You want a value to default to the same value as another argument (this cannot + be expressed in C++ default arguments). + +If you grep for `python_default_init`, you can find examples of this being used; +in general, most functions will not need to use this. + +## Writing an implementation in C++ + +Implementations of native functions go in an appropriate C++ file in the +`native/` directory (they are organized roughly by topic, but there is no +semantic meaning to their organization aside for the `cuda` directory, +which is the only place the build system knows how to build `cu` files.) +To write a native function, you only need to write a C++ +implementation (no header necessary) with a matching signature to +the generated header from the ATen metadata. There are many +simple native functions; take a look at some of them to see what to do. + +Although, for the most part, writing an ATen function is mostly writing +the algorithm you want to implement, there are some less obvious details +you should also consider. + +### Will your function be automatically differentiable? 
+ +If you are writing a pair of functions `foo` and `foo_backward`, with +the intent that `foo_backward` implements the derivative of `foo`, then +your implementation of `foo` is probably not automatically differentiable: +it might make use of functions like `data_ptr()` or it dispatches differently +depending on if it's operating on CPU or CUDA tensors. Once you write these two functions, +you will have to write an entry correlating them together in +`tools/autograd/derivatives.yaml`. + +However, in some situations, you can write a function in ATen and it +will be automatically differentiated! This can be the case if the function implementation +only calls other operations which are themselves differentiable. In this +case, you don't have to write an entry in `tools/autograd/derivatives.yaml`. + +### Can it handle being passed Variables? + +The biggest subtlety of writing an ATen implementation is the fact that +`Tensor` is not a "final" class: your implementation may be passed objects +which inherit from `Tensor` (in particular, the `Variable` subclass +implements automatic differentiation in PyTorch.) This has some +direct consequences on valid implementations: + +* Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a + caller will be expecting to get `Variable`s out if it passes `Variable`. + Instead, create tensors from the `type()` of one of the input tensors, e.g., + `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + a different scalar type. + +* If you need to call other ATen functions, be sure to qualify the call + with `at::`; don't call them unqualified (in the `at::native` namespace). + Using the qualified name ensures that your invocation gets dispatched to + the `Variable` (which may be overridden to behave differently than + simply dispatch to `at::native`). + +These are not hard and fast rules: in particular, if you explicitly define +a derivative for a function, it will only ever be called with `Tensor` +arguments. However, it is considered good style to abide by these rules, +since code written in this style is more robust. + +NB: There is one downside to following the `at::` qualification rule, which +is that if you know that you will only ever be called with `Tensor`, a +direct `at::native` call will be more efficient (as it avoids a dynamic +dispatch). + +### How to handle broadcasting? + +Unlike our legacy TH bindings, ATen native functions do not automatically +handle broadcasting; you will have to insert the necessary broadcasting +calls yourself. + +When writing broadcasting code, we obey the convention that `op` is +broadcasting, while `s_op` (with the `s_` prefix) is not broadcasting. 
The +relationship is best seen by an example of how you would implement broadcasting +addition out of non-broadcasting addition: + +``` +#include + +Tensor add(const Tensor& self, const Tensor& other) { + Tensor b_self, b_other; + std::tie(b_self, b_other) = expand_outplace(self, other, "add"); + return s_add(b_self, b_other); +} + +Tensor s_add(const Tensor& self, const Tensor& other) { + // non-broadcasting implementation of addition +} +``` + +For inplace operations, the convention looks like this: + +``` +Tensor& add_(Tensor& self, const Tensor& other) { + Tensor b_other = expand_inplace(self, other, "add_"); + return s_add_(self, b_other); +} + +Tensor& s_add_(Tensor& self, const Tensor& other) { + // non-broadcasting implementation of inplace addition +} +``` + +### Undefined tensor conventions + +By default, `Tensor` arguments to ATen functions are always defined, unless +you explicitly specified that an undefined tensor was permissible by writing +`Tensor?` or `Tensor x={}`. + +The rules for returning undefined Tensors are a bit more subtle, but there +is only one case you have to remember: + +* If the function in question is a backward function which accepts a + `std::array output_mask` argument, you MUST return an undefined + `Tensor` at every tuple position `i` for which `output_mask[i]` is false, otherwise + +* You MUST NOT return an undefined tensor. + +The most common situations where you might be tempted to return undefined tensors +are when: + +- You have a forward function that may return a buffer if training is enabled, but does not + return the buffer in inference mode. In this case, just return an appropriately + typed zero-size tensor. + +- You have a backward function where the gradient for an input is zero. In this case, you + are expected to create a zero-filled tensor of appropriate size to return for this input. + To get the shape, it may be helpful to take a `TensorGeometry` of the input to use. + +### Debugging tips + +If you build ATen and get a linker error, that probably means you copy-pasted +the C++ definition of your function incorrectly. Double check your `Tensor` +arguments, and make sure you wrote `const Tensor&` in your signature. diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp new file mode 100644 index 0000000..affa9d2 --- /dev/null +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -0,0 +1,685 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/WrapDimUtilsMulti.h" +#include "ReduceOpsUtils.h" +#include "cpu/ReduceOpsKernel.h" + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +static inline Tensor integer_upcast(const Tensor& self, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); + return self.toType(upcast_scalarType); +} + +static inline Tensor cumsum(const Tensor& self, int64_t dim, optional dtype) { + return at::_cumsum(integer_upcast(self, dtype), dim); +} + +Tensor cumsum(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumsum(self, dim, optional(dtype)); +} + +Tensor cumsum(const Tensor& self, int64_t dim) { + return at::native::cumsum(self, dim, nullopt); +} + +static inline Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in cumsum. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_cumsum_out(result, self.toType(result.type().scalarType()), dim); +} + +Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumsum_out(result, self, dim, optional(dtype)); +} + +Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim) { + return at::native::cumsum_out(result, self, dim, nullopt); +} + +static inline Tensor cumprod(const Tensor& self, int64_t dim, optional dtype) { + return at::_cumprod(integer_upcast(self, dtype), dim); +} + +Tensor cumprod(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumprod(self, dim, optional(dtype)); +} + +Tensor cumprod(const Tensor& self, int64_t dim) { + return at::native::cumprod(self, dim, nullopt); +} + +static inline Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in cumprod. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_cumprod_out(result, self.toType(result.type().scalarType()), dim); +} + +Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::cumprod_out(result, self, dim, optional(dtype)); +} + +Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim) { + return at::native::cumprod_out(result, self, dim, nullopt); +} + +// ALL REDUCE ################################################################# + +static inline Tensor mean(const Tensor &self, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. 
Got ", + at::toString(scalarType), + " instead."); + if (self.numel() > 0) { + Tensor result = at::native::sum(self); + return result.div_(self.numel()); + } else { + return self.type().scalarTensor(std::numeric_limits::quiet_NaN()); + } +} + +Tensor mean(const Tensor &self, ScalarType dtype) { + return at::native::mean(self, optional(dtype)); +} + +Tensor mean(const Tensor &self) { + return at::native::mean(self, nullopt); +} + +static inline Tensor sum(const Tensor &self, optional dtype) { + return at::_sum(integer_upcast(self, dtype)); +} + +Tensor sum(const Tensor &self, ScalarType dtype) { + return at::native::sum(self, optional(dtype)); +} + +Tensor sum(const Tensor &self) { + return at::native::sum(self, nullopt); +} + +Tensor _sum_cpu(const Tensor& self) { + if (self.is_contiguous()) { + Tensor result = at::empty({}, self.type()); + sum_kernel(result, self, at::nullopt); + return result; + } + return self._sumall(); +} + +static inline Tensor prod(const Tensor &self, optional dtype) { + return at::_prod(integer_upcast(self, dtype)); +} + +Tensor prod(const Tensor &self, ScalarType dtype) { + return at::native::prod(self, optional(dtype)); +} + +Tensor prod(const Tensor &self) { + return at::native::prod(self, nullopt); +} + +Tensor _prod_cpu(const Tensor &self) { + if (self.is_contiguous()) { + Tensor result = at::empty({}, self.type()); + prod_kernel(result, self, at::nullopt); + return result; + } + return self._prodall(); +} + +// \ALL REDUCE ################################################################ + +// DIM REDUCE ################################################################# + +static inline Tensor &mean_out(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim, optional dtype) { + ScalarType scalarType = result.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. Got ", + at::toString(scalarType), + " instead."); + at::native::sum_out( + result, self.toType(result.type().scalarType()), dim, keepdim); + if (result.numel() > 0 && self.ndimension() > 0) { + int64_t numel = self.size(dim); + if (numel > 0) { + result.div_(numel); + } else { + // NumPy equivalent + result.fill_(std::numeric_limits::quiet_NaN()); + } + } + return result; +} + +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::mean_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) { + return at::native::mean_out(result, self, dim, keepdim, nullopt); +} + +Tensor& mean_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::mean_out(result, self, dim, false, dtype); +} + +static inline Tensor &sum_out(Tensor &result, const Tensor &self, IntList dim, + bool keepdim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in sum. 
Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_sum_out(result, self.toType(result.type().scalarType()), dim, keepdim); +} + +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, bool keepdim, ScalarType dtype) { + return at::native::sum_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, bool keepdim) { + return at::native::sum_out(result, self, dim, keepdim, nullopt); +} + +Tensor& sum_out(Tensor& result, const Tensor& self, IntList dim, ScalarType dtype) { + return at::native::sum_out(result, self, dim, false, dtype); +} + +Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + sum_kernel(result, self, dim); + if (!keepdim) result.squeeze_(dim); + return result; + } + return at::_th_sum_out(result, self, dim, keepdim); +} + +static inline Tensor &prod_out(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim, optional dtype) { + // result type is favored over dtype; check that they match if provided (NumPy doesn't check) + AT_CHECK( + !dtype.has_value() || (result.type().scalarType() == dtype.value()), + "provided dtype must match dtype of result in prod. Got ", + at::toString(result.type().scalarType()), + " and ", + at::toString(dtype.value()), + "."); + return at::_prod_out(result, self.toType(result.type().scalarType()), dim, keepdim); +} + +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::prod_out(result, self, dim, keepdim, at::optional(dtype)); +} +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) { + return at::native::prod_out(result, self, dim, keepdim, nullopt); +} + +Tensor& prod_out(Tensor& result, const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::prod_out(result, self, dim, false, dtype); +} + +Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + prod_kernel(result, self, dim); + if (!keepdim) result.squeeze_(dim); + return result; + } + return at::_th_prod_out(result, self, dim, keepdim); +} + +static inline Tensor mean(const Tensor &self, int64_t dim, bool keepdim, optional dtype) { + ScalarType scalarType = self.type().scalarType(); + AT_CHECK( + at::isFloatingType(scalarType), + "Can only calculate the mean of floating types. 
Got ", + at::toString(scalarType), + " instead."); + Tensor result = at::native::sum(self, dim, keepdim); + if (result.numel() > 0 && self.ndimension() > 0) { + int64_t numel = self.size(dim); + if (numel > 0) { + result.div_(numel); + } else { + // NumPy equivalent + result.fill_(std::numeric_limits::quiet_NaN()); + } + } + return result; +} + +Tensor mean(const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::mean(self, dim, keepdim, at::optional(dtype)); +} + +Tensor mean(const Tensor& self, int64_t dim, bool keepdim) { + return at::native::mean(self, dim, keepdim, nullopt); +} + +Tensor mean(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::mean(self, dim, false, dtype); +} + +static inline Tensor sum(const Tensor &self, IntList dim_, bool keepdim, optional dtype) { + return at::_sum(integer_upcast(self, dtype), dim_, keepdim); +} + +Tensor sum(const Tensor& self, IntList dim, bool keepdim, ScalarType dtype) { + return at::native::sum(self, dim, keepdim, at::optional(dtype)); +} + +Tensor sum(const Tensor& self, IntList dim, bool keepdim) { + return at::native::sum(self, dim, keepdim, nullopt); +} + +Tensor sum(const Tensor& self, IntList dim, ScalarType dtype) { + return at::native::sum(self, dim, false, dtype); +} + +Tensor _sum(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::_sum_out(result, self, dim, keepdim); +} + +static inline Tensor prod(const Tensor &self, int64_t dim_, bool keepdim, optional dtype) { + return at::_prod(integer_upcast(self, dtype), dim_, keepdim); +} + +Tensor prod(const Tensor& self, int64_t dim, bool keepdim, ScalarType dtype) { + return at::native::prod(self, dim, keepdim, at::optional(dtype)); +} + +Tensor prod(const Tensor& self, int64_t dim, bool keepdim) { + return at::native::prod(self, dim, keepdim, nullopt); +} + +Tensor prod(const Tensor& self, int64_t dim, ScalarType dtype) { + return at::native::prod(self, dim, false, dtype); +} + +Tensor _prod(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::_prod_out(result, self, dim, keepdim); +} + +Tensor& logsumexp_out(Tensor& result, const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + // can't take max of empty tensor. + if (self.numel() != 0) { + auto maxes = at::max_values(self, dim, true); + result = at::where((maxes == INFINITY).__or__(maxes == -INFINITY), + maxes, + maxes + at::log(at::sum(at::exp(self - maxes), dim, true))); + } else { + result = at::log(at::sum(at::exp(self), dim, true)); + } + if (! keepdim) + result.squeeze_(dim); + return result; +} + +Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + Tensor result = self.type().tensor(); + return at::native::logsumexp_out(result, self, dim, keepdim); +} + +// \DIM REDUCE ################################################################ + +// MULTI DIM REDUCE ########################################################### + +// NB: this applies two optimizations: +// 1. Reducing the dimensions in the order of decreasing size, so that the +// larger dimensions are dealt earlier and we can work with less elements +// overall. +// E.g., reducing tensor of shape [1, 10, 200] over dimemsions {0, 1, 2}. 
+// If we reduce in the order of [0, 1, 2], the input and output +// shapes of iterations are: +// it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem) +// it 1: [10, 200] (2000 elem) => [200] ( 200 elem) +// it 2: [200] ( 200 elem) => [ 1] ( 1 elem) +// Since we need to iterate through all input elements at each +// iteration, total number of elements traversed is 4200. +// If we reduce in the order of [2, 1, 0], i.e., with decreasing +// size, the input and output shapes of iterations are: +// it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem) +// it 1: [1, 10] ( 10 elem) => [ 1] ( 1 elem) +// it 2: [1] ( 1 elem) => [ 1] ( 1 elem) +// Total number of elements traversed is 2011, much less than 4200. +// 2. Preallocated buffer. +// Utilizing the `_out` variant, instead of allocating new output tensors +// at each iteration, we can use a preallocated buffer. Since output numel +// in each iteration is decreasing, we can reuse the buffer throughout the +// loop. +// Note that we need two buffers, one containing the input, i.e., output +// from the previous iteration, and one containing the output for this +// iteration. +// The largest output size is the output size of the first iteration. After +// that the largest size we need is the output size of the second +// iteration. +// So we allocate +// 1. a region of size `input.numel() / input.size(reduced_dims[0])`, and +// 2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`. +// These two regions are allocated together as a contiguous flattened +// buffer tensor, with a variable `offset` indicating the starting position +// of the output region for the current iteration. +// E.g., reducing tensor of shape [4, 3, 2] over dimemsions {0, 1, 2}. +// Say we reduce in the order of [0, 1, 2]. +// The first buffer with has size `4 * 3 * 2 / 4 = 6`. +// The second buffer with has size `4 * 3 * 2 / (4 * 3) = 2`. +// So we allocate a tensor of size `6 + 2 = 8`: +// buffer: [ _, _, _, _, _, _, _, _] +// buffer region 1-->^^^^^^^^^^^^^^^^ ^^^^<--buffer region 2 +// 1st iteration: +// (before reduction) +// input: self (or input) +// input shape: [ 4, 3, 2] +// output shape: [ 3, 2] +// buffer: [ _, _, _, _, _, _, _, _] +// offset: ^--beginning of 1st buffer region, i.e., the +// starting output location of 1st iteration. +// (after reduction) +// buffer: [ {output of 1st it}, _, _] +// +// 2nd iteration: +// (before reduction) +// input: output of 1st it +// input shape: [ 3, 2] +// output shape: [ 2] +// buffer: [ {output of 1st it}, _, _] +// offset: ^--beginning of 2nd +// buffer region. We can't +// overwrite the 1st buffer +// as it contains input to +// reduction of this it. +// (after reduction) +// buffer: [ {output of 1st it}, {output of 2nd it}] +// +// 3rd iteration: +// (before reduction) +// input: output of 2nd it +// input shape: [ 2] +// output shape: [ 1] +// buffer: [ {output of 1st it}, {output of 2nd it}] +// offset: ^--beginning of 1st buffer region. We can +// safely overwrite now. +// (after reduction) +// buffer: [ {output of 3rd it}, {output of 2nd it}] +// Return {output of 3rd it}. +// +// TODO: If two or more reduced dimensions are contiguous, reduce as if they are +// a large dimension. 
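+// NB: The two templates below are parameterized by a single-dimension reduction
+// and its `_out` variant (for example, `_sum` and `_sum_out` further down in this
+// file), which they apply repeatedly, one reduced dimension at a time.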
+template +inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) { + if (dims_.size() == 1) { + return reduce_1(self, dims_[0], keepdim); + } + if (dims_.size() == 0) { + return self; + } + int64_t ndims = self.dim(); + // `reduced_numel` and `reduced_size` will be updated in the loop. + // Before that, they are just size and numel. + int64_t reduced_numel = self.numel(); + auto reduced_size = self.sizes().vec(); + auto dims = dims_.vec(); + maybe_wrap_dims(dims, ndims); + // Sort the reduced dimensions so that we reduce the larger dimensions first. + std::sort(dims.begin(), dims.end(), + [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; }); + // Calculate 1st buffer region size + int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]]; + int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]]; + // We separate `buffer` into two regions, one starting at 0, and another + // starting at max_reduced_numel. These two regions are used alternatively as + // the output of a `reduce_1` along a particular dimension. `offset` will + // indicate which region we should use next. + // Have keepdim=true when reducing. We will squeeze later. + auto buffer = at::empty({buffer_size}, self.options()); + int64_t offset = 0; + Tensor t = self; + for (auto& dim : dims) { + reduced_numel /= reduced_size[dim]; + reduced_size[dim] = 1; + auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size); + t = reduce_1_out(res, t, dim, true); + // switch to other buffer region + // this alternatively changes `offset` between 0 and max_reduced_numel + offset = max_reduced_numel - offset; + } + // squeeze if needed + if (!keepdim) { + std::vector squeezed_shape; + squeezed_shape.reserve(ndims - dims.size()); + auto reduce_dims = dim_list_to_bitset(dims_, ndims); + for (int64_t dim = 0; dim < ndims; dim++) { + if (!reduce_dims[dim]) { + squeezed_shape.emplace_back(reduced_size[dim]); + } + } + return t.view(squeezed_shape); + } + return t; +} + +// See comments above reduce_multi_associative for details. +template +inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) { + if (dims_.size() == 1) { + return reduce_1_out(result, self, dims_[0], keepdim); + } + if (dims_.size() == 0) { + // reduce_out should be clone_out with empty dims_ + return result.resize_as_(self).copy_(self); + } + int64_t ndims = self.dim(); + // `reduced_numel` and `reduced_size` will be updated in the loop. + // Before that, they are just size and numel. + int64_t reduced_numel = self.numel(); + auto reduced_size = self.sizes().vec(); + auto dims = dims_.vec(); + maybe_wrap_dims(dims, ndims); + // Sort the reduced dimensions so that we reduce the largest dimension first. + std::sort(dims.begin(), dims.end(), + [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; }); + // Calculate 1st buffer region size + int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]]; + int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]]; + // We separate `buffer` into two regions, one starting at 0, and another + // starting at max_reduced_numel. These two regions are used alternatively as + // the output of a `reduce_1` along a particular dimension. `offset` will + // indicate which region we should use next. + // Have keepdim=true when reducing. We will squeeze later. 
+ auto buffer = at::empty({buffer_size}, self.options()); + int64_t offset = 0; + Tensor t = self; + int64_t last_reduction = dims.size() - 1; + int64_t num_reduction = 0; + for (auto& dim : dims) { + reduced_numel /= reduced_size[dim]; + reduced_size[dim] = 1; + auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size); + if (num_reduction < last_reduction) { + t = reduce_1_out(res, t, dim, true); + } else { + reduce_1_out(result, t, dim, true); + } + // switch to other buffer region + // this alternatively changes `offset` between 0 and max_reduced_numel + offset = max_reduced_numel - offset; + num_reduction++; + } + // squeeze if needed (use in-place squeeze_) + if (!keepdim) { + auto reduce_dims = dim_list_to_bitset(dims_, ndims); + for (int64_t dim = ndims - 1; dim >= 0; dim--) { + if (reduce_dims[dim]) { + result.squeeze_(dim); + } + } + } + return result; +} + +Tensor& _sum_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + if (self.is_cuda()) { + return at::_sum_cuda_out(result, self, dim, keepdim); + } else { + return _sum_out_cpu(result, self, dim, keepdim); + } +} + +Tensor _sum(const Tensor &self, IntList dims, bool keepdim) { + return reduce_multi_associative<_sum, _sum_out>(self, dims, keepdim); +} + +Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) +{ + return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } +} + +Tensor all(const Tensor& self, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::all_out(result, self, dim, keepdim); +} + +Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "all only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { + return result; + } else { + return at::_th_all_out(result, self, dim, keepdim); + } +} + +Tensor any(const Tensor& self, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::any_out(result, self, dim, keepdim); +} + +Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "any only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(self.type().scalarType() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, 
keepdim)) { + return result; + } else { + return at::_th_any_out(result, self, dim, keepdim); + } +} + +Tensor var(const Tensor& self, bool unbiased) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "var only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "var only supports floating-point dtypes"); + auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); + return trivial_return.has_value() ? trivial_return.value() : at::_th_var(self, unbiased); +} + +Tensor var(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::var_out(result, self, dim, unbiased, keepdim); +} + +Tensor &var_out(Tensor &result, const Tensor &self, int64_t dim, bool unbiased, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "var only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "var only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, std::numeric_limits::quiet_NaN(), dim, keepdim)) { + return result; + } else { + return at::_th_var_out(result, self, dim, unbiased, keepdim); + } +} + +Tensor std(const Tensor& self, bool unbiased) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "std only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "std only supports floating-point dtypes"); + auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); + return trivial_return.has_value() ? 
trivial_return.value() : at::_th_std(self, unbiased); +} + +Tensor std(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::std_out(result, self, dim, unbiased, keepdim); +} + +Tensor &std_out(Tensor &result, const Tensor &self, int64_t dim, bool unbiased, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "std only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "std only supports floating-point dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial(result, self, std::numeric_limits::quiet_NaN(), dim, keepdim)) { + return result; + } else { + return at::_th_std_out(result, self, dim, unbiased, keepdim); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h new file mode 100644 index 0000000..172d3c1 --- /dev/null +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -0,0 +1,55 @@ +#pragma once + +namespace at { namespace native { + +static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self, + int64_t dim) { + IntList self_sizes = self.sizes(); + std::vector result_sizes; + result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end()); + result_sizes[dim] = 1; + result.resize_(result_sizes); + return result; +} + +static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self, + Scalar ident, int64_t dim, bool keepdim) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + // Return identity + if (self.numel() == 0) { + _dimreduce_setup(result, self, dim); + result.fill_(ident); + if (!keepdim) result.squeeze_(dim); + return true; + } + return false; +} + +static bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self, + int64_t dim, bool keepdim, const char *fn_name) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + + if (self.numel() == 0) { + AT_ERROR("cannot perform reduction function ", fn_name, + " on tensor with no elements because the operation does not have an identity"); + } + return false; +} + +static at::optional _allreduce_return_trivial(const Tensor &self, Scalar ident) { + // Return identity + if (self.numel() == 0) { + return self.type().scalarTensor(ident); + } + return at::nullopt; +} + +}} // at::native diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp new file mode 100644 index 0000000..5995e43 --- /dev/null +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -0,0 +1,141 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { +namespace native { + +std::tuple RoiPooling2d_forward_cpu( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale) +{ + // Input is the output of the last convolutional layer in the Backbone network, so + // it should be in the format of NCHW + AT_CHECK(input.ndimension() == 4, "Input to RoI Pooling should be a NCHW Tensor"); + + // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first + // dim is the # of proposals, and the second dim is the proposal itself in the form + // [batch_index startW startH endW endH] + AT_CHECK(rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); + AT_CHECK(rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); + + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) + auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + // TODO: need some mechanism for determining train vs. test + + // During training, we need to store the argmaxes for the pooling operation, so + // the argmaxes Tensor should be the same size as the output Tensor + auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + AT_CHECK(input.is_contiguous(), "input must be contiguous"); + AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); + + auto *rawInput = input.data(); + auto inputChannelStride = inputHeight * inputWidth; + auto inputBatchStride = inputChannels * inputChannelStride; + auto *rawRois = rois.data(); + auto roiProposalStride = rois.size(1); + + auto *rawOutput = output.data(); + auto *rawArgmaxes = argmaxes.data(); + auto outputChannelStride = pooledHeight * pooledWidth; + + // Now that our Tensors are properly sized, we can perform the pooling operation. + // We iterate over each RoI and perform pooling on each channel in the input, to + // generate a pooledHeight x pooledWidth output for each RoI + for (auto i = 0; i < proposals; ++i) { + auto n = static_cast(rawRois[0]); + auto startWidth = static_cast(std::round(rawRois[1] * spatialScale)); + auto startHeight = static_cast(std::round(rawRois[2] * spatialScale)); + auto endWidth = static_cast(std::round(rawRois[3] * spatialScale)); + auto endHeight = static_cast(std::round(rawRois[4] * spatialScale)); + + // TODO: assertions for valid values? + // TODO: fix malformed ROIs?? 
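+    // For example, with spatialScale = 1, pooledHeight = pooledWidth = 2, and an
+    // RoI of [0, 2, 3, 8, 11], we get roiWidth = 8 - 2 = 6 and roiHeight = 11 - 3 = 8
+    // below, so each of the 2 x 2 output cells max-pools over a 4 x 3 tile of that
+    // channel (before clipping to the input bounds).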
+ + auto roiHeight = endHeight - startHeight; + auto roiWidth = endWidth - startWidth; + + // Because the Region of Interest can be of variable size, but our output + // must always be (pooledHeight x pooledWidth), we need to split the RoI + // into a pooledHeight x pooledWidth grid of tiles + + auto tileHeight = static_cast(roiHeight) / static_cast(pooledHeight); + auto tileWidth = static_cast(roiWidth) / static_cast(pooledWidth); + + auto *rawInputBatch = rawInput + (n * inputBatchStride); + + // Compute pooling for each of the (pooledHeight x pooledWidth) tiles for each + // channel in the input + for (auto ch = 0; ch < inputChannels; ++ch) { + for (auto ph = 0; ph < pooledHeight; ++ph) { + for (auto pw = 0; pw < pooledWidth; ++pw) { + auto tileHStart = static_cast(std::floor(ph * tileHeight)); + auto tileWStart = static_cast(std::floor(pw * tileWidth)); + auto tileHEnd = static_cast(std::ceil((ph + 1) * tileHeight)); + auto tileWEnd = static_cast(std::ceil((pw + 1) * tileWidth)); + + // Add tile offsets to RoI offsets, and clip to input boundaries + tileHStart = std::min(std::max(tileHStart + startHeight, 0), inputHeight); + tileWStart = std::min(std::max(tileWStart + startWidth, 0), inputWidth); + tileHEnd = std::min(std::max(tileHEnd + startHeight, 0), inputHeight); + tileWEnd = std::min(std::max(tileWEnd + startWidth, 0), inputWidth); + + auto poolIndex = (ph * pooledWidth) + pw; + + // If our pooling region is empty, we set the output to 0, otherwise to + // the min float so we can calculate the max properly + auto empty = tileHStart >= tileHEnd || tileWStart >= tileWEnd; + rawOutput[poolIndex] = empty ? 0 : std::numeric_limits::min(); + + // Set to -1 so we don't try to backprop to anywhere + // TODO: make optional for test + rawArgmaxes[poolIndex] = -1; + + for (auto th = tileHStart; th < tileHEnd; ++th) { + for (auto tw = tileWStart; tw < tileWEnd; ++tw) { + auto index = (th * inputWidth) + tw; + if (rawInputBatch[index] > rawOutput[poolIndex]) { + rawOutput[poolIndex] = rawInputBatch[index]; + // TODO: make optional for test + rawArgmaxes[poolIndex] = index; + } + } + } + } + } + // Increment raw pointers by channel stride + rawInputBatch += inputChannelStride; + rawOutput += outputChannelStride; + // TODO: make optional for test + rawArgmaxes += outputChannelStride; + } + // Increment RoI raw pointer + rawRois += roiProposalStride; + } + + return std::make_tuple(output, argmaxes); +} + +Tensor RoiPooling2d_backward_cpu( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale, + const Tensor& gradOutput, + const Tensor& argmaxes) { + throw std::runtime_error("not implemented"); +} + +} +} diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp new file mode 100644 index 0000000..546c758 --- /dev/null +++ b/aten/src/ATen/native/SoftMax.cpp @@ -0,0 +1,217 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Parallel.h" +#include "ATen/TensorUtils.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/native/cpu/SoftmaxKernel.h" + +namespace at { +namespace native { +namespace { + +template +void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + if (input.numel() == 0) { + return; + } + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + 
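+  // For example, softmax over dim = 1 of a contiguous [4, 3, 5] input gives
+  // outer_size = 4, dim_size = 3 and inner_size = 5; element (o, d, i) then lives
+  // at offset o * outer_stride + d * dim_stride + i, which is how the pointers are
+  // advanced below.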
int64_t dim_stride = inner_size; + int64_t outer_stride = dim_size * dim_stride; + scalar_t* input_data_base = input.data(); + scalar_t* output_data_base = output.data(); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); + parallel_for( + 0, outer_size * inner_size, grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / inner_size; + int64_t inner_idx = i % inner_size; + scalar_t* input_data = + input_data_base + outer_idx * outer_stride + inner_idx; + scalar_t* output_data = + output_data_base + outer_idx * outer_stride + inner_idx; + scalar_t max_input = input_data[0]; + for (int64_t d = 1; d < dim_size; d++) + max_input = std::max(max_input, input_data[d * dim_stride]); + + scalar_t tmpsum = 0; + for (int64_t d = 0; d < dim_size; d++) { + scalar_t z = std::exp(input_data[d * dim_stride] - max_input); + if (!LogSoftMax) { + output_data[d * dim_stride] = z; + } + tmpsum += z; + } + + if (LogSoftMax) + tmpsum = max_input + std::log(tmpsum); + else + tmpsum = 1 / tmpsum; + + for (int64_t d = 0; d < dim_size; d++) + if (LogSoftMax) + output_data[d * dim_stride] = input_data[d * dim_stride] - tmpsum; + else + output_data[d * dim_stride] *= tmpsum; + } + }); +} + +template +void host_softmax_backward( + Tensor& gI, + const Tensor& grad, + const Tensor& output, + int64_t dim) { + + int64_t outer_size = 1; + int64_t dim_size = grad.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= grad.size(i); + for (int64_t i = dim + 1; i < grad.dim(); ++i) + inner_size *= grad.size(i); + int64_t dim_stride = inner_size; + int64_t outer_stride = dim_size * dim_stride; + scalar_t* gradInput_data_base = gI.data(); + scalar_t* output_data_base = output.data(); + scalar_t* gradOutput_data_base = grad.data(); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); + parallel_for( + 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / inner_size; + int64_t inner_idx = i % inner_size; + scalar_t* gradInput_data = + gradInput_data_base + outer_idx * outer_stride + inner_idx; + scalar_t* output_data = + output_data_base + outer_idx * outer_stride + inner_idx; + const scalar_t* gradOutput_data = + gradOutput_data_base + outer_idx * outer_stride + inner_idx; + + scalar_t sum = 0; // TODO was accreal here + for (int64_t d = 0; d < dim_size; d++) + if (LogSoftMax) + sum += gradOutput_data[d * dim_stride]; + else + sum += + gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + + for (int64_t d = 0; d < dim_size; d++) { + if (LogSoftMax) { + gradInput_data[d * dim_stride] = gradOutput_data[d * dim_stride] - + std::exp(output_data[d * dim_stride]) * sum; + } else { + gradInput_data[d * dim_stride] = output_data[d * dim_stride] * + (gradOutput_data[d * dim_stride] - sum); + } + } + } + }); +} +} // namespace + +Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { + auto input = input_.contiguous(); + Tensor output = at::native::empty_like(input); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + if (input.dim() == 0) + input = input.view(1); + AT_CHECK( + dim >= 0 && dim < input.dim(), + "dim must be non-negative and less than input dimensions"); + if (input.ndimension() > 0 && dim == input.ndimension() - 1) { + softmax_lastdim_kernel(output, input); + } else { + AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { + host_softmax(output, input, dim); + }); + } + 
return output; +} + +Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { + auto input = input_.contiguous(); + Tensor output = at::native::empty_like(input); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + if (input.dim() == 0) + input = input.view(1); + AT_CHECK( + dim >= 0 && dim < input.dim(), + "dim must be non-negative and less than input dimensions"); + if (input.ndimension() > 0 && dim == input.ndimension() - 1) { + log_softmax_lastdim_kernel(output, input); + } else { + AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { + host_softmax(output, input, dim); + }); + } + return output; +} + +Tensor softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize("softmax_backward", grad_arg, output_arg); + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + Tensor grad_input = at::native::empty_like(grad); + + if (grad.dim() == 0) + grad = grad.view(1); + if (output.dim() == 0) + output = output.view(1); + AT_CHECK( + dim >= 0 && dim < grad.dim(), + "dim must be non-negative and less than input dimensions"); + if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { + softmax_backward_lastdim_kernel(grad_input, grad, output); + } else { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { + host_softmax_backward(grad_input, grad, output, dim); + }); + } + return grad_input; +} + +Tensor log_softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize("log_softmax_backward", grad_arg, output_arg); + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + Tensor grad_input = at::native::empty_like(grad); + + if (grad.dim() == 0) + grad = grad.view(1); + if (output.dim() == 0) + output = output.view(1); + AT_CHECK( + dim >= 0 && dim < grad.dim(), + "dim must be non-negative and less than input dimensions"); + if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { + log_softmax_backward_lastdim_kernel(grad_input, grad, output); + } else { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { + host_softmax_backward(grad_input, grad, output, dim); + }); + } + return grad_input; +} +} +} diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp new file mode 100644 index 0000000..5d1c883 --- /dev/null +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -0,0 +1,269 @@ +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/native/SpectralOpsUtils.h" + +#include +#include +#include + +namespace at { namespace native { + +// This is a pass-through wrapper function that does the size check and +// inferences. The actual forward implementation function is called +// at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU). 
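+// For reference, the public wrappers defined later in this file forward to _fft
+// with the following flags:
+//   fft:   complex_input=true,  complex_output=true,  inverse=false, onesided=false
+//   ifft:  complex_input=true,  complex_output=true,  inverse=true,  onesided=false
+//   rfft:  complex_input=false, complex_output=true,  inverse=false, onesided as passed
+//   irfft: complex_input=true,  complex_output=false, inverse=true,  onesided as passed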
+static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, + const bool complex_input, const bool complex_output, + const bool inverse, IntList signal_sizes, const bool normalized, + const bool onesided) { + + if (signal_ndim < 1 || signal_ndim > 3) { + std::ostringstream ss; + ss << "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=" + << signal_ndim; + throw std::runtime_error(ss.str()); + } + if (!at::isFloatingType(self.type().scalarType())) { + std::ostringstream ss; + ss << "Expected an input tensor of floating types, but got input=" + << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + + auto signal_tensor_ndim = signal_ndim + static_cast(complex_input); // add complex dim + if (self.dim() < signal_tensor_ndim) { + std::ostringstream ss; + ss << "Given signal_ndim=" << signal_ndim << ", expected an input tensor " + << "of at least" << signal_tensor_ndim << "D"; + if (complex_input) { + ss << " (complex input adds an extra dimension)"; + } + ss << ", but got input=" << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + + auto self_shape = self.sizes(); + auto batch_ndim = self.dim() - signal_tensor_ndim; + + Tensor input = self; + // flatten the batch dims + if (batch_ndim == 0) { + // slightly faster path for non-batch mode + input = input.unsqueeze(0); + } else if (batch_ndim > 1) { + std::vector flatten_input_shape(signal_tensor_ndim + 1); + std::copy(self_shape.begin() + batch_ndim, self_shape.end(), flatten_input_shape.begin() + 1); + flatten_input_shape[0] = -1; + input = input.reshape(flatten_input_shape); + + } + + // now we assume that input is batched as [ B x signal_dims... ] + + if (complex_input) { + if (input.size(signal_ndim + 1) != 2) { + std::ostringstream ss; + ss << "Expected an input tensor with a last dimension of size 2 " + << "representing real + imaginary components, but got input " + << self.type() << self.sizes(); + throw std::runtime_error(ss.str()); + } + } + + // build signal_sizes and output_size + if (signal_sizes.size() > 0 && static_cast(signal_sizes.size()) != signal_ndim) { + std::ostringstream ss; + ss << "Expected signal_sizes to be empty (default) or of signal_ndim=" + << signal_ndim << "D, but got signal_sizes=" << signal_sizes; + throw std::runtime_error(ss.str()); + } + std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); + output_sizes[0] = input.size(0); // batch size + std::vector checked_signal_sizes(signal_ndim); + for (int64_t i = 0; i < signal_ndim; i++) { + int64_t input_size = input.size(i + 1); + if (i == signal_ndim - 1 && onesided && complex_input && !complex_output) { + // If last dim and complex-to-real onesided, input is only half of + // signal, and we need to infer basing on signal_sizes, if given + // See native/SpectralOpsUtils.h for detailed description. 
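+      // For example, a onesided complex size of 6 is compatible with a real signal
+      // length of 10 or 11 (i.e., (6 - 1) * 2 or (6 - 1) * 2 + 1); when signal_sizes
+      // is provided it resolves this ambiguity, otherwise
+      // infer_ft_complex_to_real_onesided_size picks a default.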
+ int64_t inferred_size; + if (signal_sizes.size() > 0) { + inferred_size = infer_ft_complex_to_real_onesided_size(input_size, signal_sizes[i]); + } else { + inferred_size = infer_ft_complex_to_real_onesided_size(input_size); + } + checked_signal_sizes[i] = inferred_size; + output_sizes[i + 1] = inferred_size; + } else { + if (i == signal_ndim - 1 && onesided && !complex_input && complex_output) { + // if last dim and real-to-complex onesided, output should be only + // half of the signal, and we need to infer using input_size + output_sizes[i + 1] = infer_ft_real_to_complex_onesided_size(input_size); + } else { + output_sizes[i + 1] = input_size; + } + checked_signal_sizes[i] = input_size; + if (signal_sizes.size() > 0 && signal_sizes[i] != checked_signal_sizes[i]) { + std::ostringstream ss; + ss << "Expected given signal_sizes=" << signal_sizes << " to have same " + << "shape with input at signal dimension " << i << ", but got " + << "signal_sizes=" << signal_sizes << " and input=" << self.type() + << self.sizes(); + throw std::runtime_error(ss.str()); + } + } + } + if (complex_output) { + output_sizes[signal_ndim + 1] = 2; + } + + Tensor output = at::_fft_with_size(input, signal_ndim, complex_input, + complex_output, inverse, + checked_signal_sizes, normalized, onesided, + output_sizes); + + // unflatten the batch dims + if (batch_ndim == 0) { + // slightly faster path for non-batch mode + output = output.squeeze(0); + } else if (batch_ndim > 1) { + auto output_ndim = self.dim() + static_cast(complex_output) - static_cast(complex_input); + std::vector unflatten_output_shape(output_ndim); + std::copy(self_shape.begin(), self_shape.begin() + batch_ndim, unflatten_output_shape.begin()); + std::copy(output_sizes.begin() + 1, output_sizes.end(), unflatten_output_shape.begin() + batch_ndim); + output = output.reshape(unflatten_output_shape); + } + return output; +} + +// We call the following methods via CUDA hooks because they are really only +// valid when CUDA is available. See native/cuda/CuFFTPlanCache.h for more details. 
+int64_t _cufft_get_plan_cache_max_size() {
+  return detail::getCUDAHooks().cuFFTGetPlanCacheMaxSize();
+}
+
+void _cufft_set_plan_cache_max_size(int64_t max_size) {
+  detail::getCUDAHooks().cuFFTSetPlanCacheMaxSize(max_size);
+}
+
+int64_t _cufft_get_plan_cache_size() {
+  return detail::getCUDAHooks().cuFFTGetPlanCacheSize();
+}
+
+void _cufft_clear_plan_cache() {
+  detail::getCUDAHooks().cuFFTClearPlanCache();
+}
+
+Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ true, /* inverse */ false, {}, normalized,
+              /* onesided */ false);
+}
+
+Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ true, /* inverse */ true, {}, normalized,
+              /* onesided */ false);
+}
+
+Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
+            const bool onesided) {
+  return _fft(self, signal_ndim, /* complex_input */ false,
+              /* complex_output */ true, /* inverse */ false, {}, normalized,
+              onesided);
+}
+
+Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
+             const bool onesided, IntList signal_sizes) {
+  return _fft(self, signal_ndim, /* complex_input */ true,
+              /* complex_output */ false, /* inverse */ true, signal_sizes,
+              normalized, onesided);
+}
+
+Tensor stft(const Tensor& self, const int64_t n_fft, const int64_t hop_length,
+            const int64_t win_length, const Tensor& window,
+            const bool normalized, const bool onesided) {
+  #define REPR(SS) \
+    SS << "stft(" << self.type() << self.sizes() << ", n_fft=" << n_fft \
+       << ", hop_length=" << hop_length << ", win_length=" << win_length \
+       << ", window="; \
+    if (window.defined()) { \
+      SS << window.type() << "{" << window.sizes() << "}"; \
+    } else { \
+      SS << "None"; \
+    } \
+    SS << ", normalized=" << normalized << ", onesided=" << onesided << ")"
+
+  if (!at::isFloatingType(self.type().scalarType()) || self.dim() > 2 || self.dim() < 1) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected a 1D or 2D tensor of floating point types";
+    AT_ERROR(ss.str());
+  }
+  Tensor input = self;
+  if (self.dim() == 1) {
+    input = input.unsqueeze(0);
+  }
+  int64_t batch = input.size(0);
+  int64_t len = input.size(1);
+  if (n_fft <= 0 || n_fft > len) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected 0 < n_fft <= " << len
+             << ", but got n_fft=" << n_fft;
+    AT_ERROR(ss.str());
+  }
+  if (hop_length <= 0) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length;
+    AT_ERROR(ss.str());
+  }
+  if (win_length <= 0 || win_length > n_fft) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected 0 < win_length <= n_fft, but got win_length="
+             << win_length;
+    AT_ERROR(ss.str());
+  }
+  if (window.defined() && (window.dim() != 1 || window.size(0) != win_length)) {
+    std::ostringstream ss;
+    REPR(ss) << ": expected a 1D window tensor of size equal to win_length="
+             << win_length << ", but got window with size " << window.sizes();
+    AT_ERROR(ss.str());
+  }
+  #undef REPR
+  auto window_ = window;
+  if (win_length < n_fft) {
+    // pad the (possibly implicit all-ones) window to length n_fft, centered
+    window_ = at::zeros({n_fft}, self.options());
+    auto left = (n_fft - win_length) / 2;
+    if (window.defined()) {
+      window_.narrow(0, left, win_length).copy_(window);
+    } else {
+      window_.narrow(0, left, win_length).fill_(1);
+    }
+  }
+  int64_t n_frames = 1 + (len - n_fft) / hop_length;
+  // time2col
+  input =
input.as_strided( + {batch, n_frames, n_fft}, + {input.stride(0), hop_length * input.stride(1), input.stride(1)} + ); + if (window_.defined()) { + input = input.mul(window_); + } + // rfft and transpose to get (batch x fft_size x num_frames) + auto out = input.rfft(1, normalized, onesided).transpose_(1, 2); + if (self.dim() == 1) { + return out.squeeze_(0); + } else { + return out; + } +} + +}} // at::native diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h new file mode 100644 index 0000000..7518d1f --- /dev/null +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include + +namespace at { namespace native { + +// NOTE [ Fourier Transform Conjugate Symmetry ] +// +// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +// assuming X is the transformed K-dimensionsal signal, we have +// +// X[i_1, ..., i_K] = X[j_i, ..., j_K]*, +// +// where j_k = (N_k - i_k) mod N_k, N_k being the signal size at dim k, +// * is the conjugate operator. +// +// Therefore, in such cases, FFT libraries return only roughly half of the +// values to avoid redundancy: +// +// X[:, :, ..., :floor(N / 2) + 1] +// +// This is also the assumption in cuFFT and MKL. In ATen SpectralOps, such +// halved signal will also be returned by default (flag onesided=True). +// The following infer_ft_real_to_complex_onesided_size function calculates the +// onesided size from the twosided size. +// +// Note that this loses some information about the size of signal at last +// dimension. E.g., both 11 and 10 maps to 6. Hence, the following +// infer_ft_complex_to_real_onesided_size function takes in optional parameter +// to infer the twosided size from given onesided size. +// +// cuFFT doc: http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional +// MKL doc: https://software.intel.com/en-us/mkl-developer-reference-c-dfti-complex-storage-dfti-real-storage-dfti-conjugate-even-storage#CONJUGATE_EVEN_STORAGE + +inline int64_t infer_ft_real_to_complex_onesided_size(int64_t real_size) { + return (real_size / 2) + 1; +} + +inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, + int64_t expected_size=-1) { + int64_t base = (complex_size - 1) * 2; + if (expected_size < 0) { + return base + 1; + } else if (base == expected_size) { + return base; + } else if (base + 1 == expected_size) { + return base + 1; + } else { + std::ostringstream ss; + ss << "expected real signal size " << expected_size << " is incompatible " + << "with onesided complex frequency size " << complex_size; + throw std::runtime_error(ss.str()); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp new file mode 100644 index 0000000..fbd07cc --- /dev/null +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -0,0 +1,64 @@ +// Returns the frequency of elements of input non-negative integer tensor. 
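+//
+// Illustrative example (hypothetical values):
+//   bincount([1, 3, 1, 0])                               -> [1, 2, 0, 1]
+//   bincount([1, 3, 1, 0], weights=[0.5, 1., 0.25, 2.])  -> [2.0, 0.75, 0.0, 1.0]
+//   bincount([1, 3, 1, 0], minlength=6)                  -> [1, 2, 0, 1, 0, 0]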
+ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" + +#include + +namespace at { namespace native { + +///////////////// bincount ///////////////// +namespace { + +template +Tensor _bincount_cpu_template( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + if (minlength < 0) { + AT_ERROR("minlength should be >= 0"); + } + if (self.dim() != 1 || self.numel() == 0 || *self.min().data() < 0) { + AT_ERROR("bincount only supports 1-d non-negative integral inputs."); + } + + bool has_weights = weights.defined(); + if (has_weights && weights.size(0) != self.size(0)) { + AT_ERROR("input and weights should have the same length"); + } + + Tensor output; + int64_t nbins = static_cast(*self.max().data()) + 1L; + nbins = std::max(nbins, minlength); // at least minlength # of bins + + const input_t* self_p = self.contiguous().data(); + if (has_weights) { + output = native::zeros({nbins}, weights.options()); + weights_t* output_p = output.data(); + const weights_t* weights_p = weights.contiguous().data(); + for (int64_t i = 0; i < self.size(0); i++) { + output_p[self_p[i]] += weights_p[i]; + } + } else { + output = native::zeros({nbins}, kLong); + int64_t* output_p = output.data(); + for (int64_t i = 0; i < self.size(0); i++) { + output_p[self_p[i]] += 1L; + } + } + return output; +} +} // namespace + +Tensor +_bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) { + return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] { + const auto scalar = weights.type().scalarType(); + if (scalar == ScalarType::Undefined || scalar == ScalarType::Float) + return _bincount_cpu_template(self, weights, minlength); + return _bincount_cpu_template( + self, weights.toType(CPU(kDouble)), minlength); + }); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp new file mode 100644 index 0000000..52df990 --- /dev/null +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -0,0 +1,227 @@ +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ReduceOpsUtils.h" + +namespace { +template +void where_cpu( + at::Tensor& ret, + const at::Tensor& condition, + const at::Tensor& self, + const at::Tensor& other) { + at::CPU_tensor_apply4( + ret, + condition, + self, + other, + [](scalar_t& ret_val, + const uint8_t& cond_val, + const scalar_t& self_val, + const scalar_t& other_val) { + ret_val = cond_val ? 
self_val : other_val; + }); +} +} // namespace + +namespace at { namespace native { + +bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { + return at::isclose(self, other, rtol, atol, equal_nan).all().toCByte(); +} + +Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { + // TODO: use bitwise operator overloads once we add them + auto actual_error = (self - other).abs(); + auto max_error = atol + rtol * other.abs(); + auto close = actual_error <= max_error; + + // Handle +/-inf + close.__ior__(self == other); + close.__iand__((self == INFINITY) == (other == INFINITY)); + close.__iand__((self == -INFINITY) == (other == -INFINITY)); + + if (equal_nan) { + close.__ior__((self != self).__and__((other != other))); + } + return close; +} + +bool is_nonzero(const Tensor& self) { + auto n = self.numel(); + AT_ASSERT(n >= 0); + if (n == 0) { + AT_ERROR("bool value of Tensor with no values is ambiguous"); + } + if (n > 1) { + AT_ERROR("bool value of Tensor with more than one value is ambiguous"); + } + Scalar localScalar = self.pImpl->localScalar(); + if (localScalar.isFloatingPoint()) { + return localScalar.to() != 0; + } else if (localScalar.isIntegral()){ + return localScalar.to() != 0; + } + AT_ERROR("expected non-Tensor backed scalar"); +} + +Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { + if (condition.type().scalarType() != ScalarType::Byte) { + AT_ERROR("Expected condition to have ScalarType Byte, but got ScalarType ", + toString(condition.type().scalarType())); + } + Tensor b_condition, b_self, b_other; + std::tie(b_condition, b_self, b_other) = expand_outplace(condition, self, other, "where"); + return at::_s_where(b_condition, b_self, b_other); +} + +Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_ALL_TYPES(ret.type(), "where", [&] { + where_cpu(ret, condition, self, other); + }); + return ret; +} + +std::tuple kthvalue(const Tensor& self, int64_t k, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::kthvalue_out(values, indices, self, k, dim, keepdim); +} + +std::tuple kthvalue_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t k, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "kthvalue only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "kthvalue")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_kthvalue_out(values, indices, self, k, dim, keepdim); + } +} + +std::tuple median(const Tensor& self, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::median_out(values, indices, self, dim, keepdim); +} + +std::tuple median_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "median only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if 
(_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "median")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_median_out(values, indices, self, dim, keepdim); + } +} + +std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { + Tensor values = self.type().tensor(); + Tensor indices = self.type().toScalarType(kLong).tensor(); + return at::native::mode_out(values, indices, self, dim, keepdim); +} + +std::tuple mode_out(Tensor& values, Tensor& indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "mode only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "mode")) { + AT_ASSERT(values.dim() == 0); + indices.resize_({}).fill_(0); + return std::forward_as_tuple(values, indices); + } else { + return at::_th_mode_out(values, indices, self, dim, keepdim); + } +} + +std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { + Tensor max = self.type().tensor(); + Tensor max_indices = self.type().toScalarType(kLong).tensor(); + return at::native::max_out(max, max_indices, self, dim, keepdim); +} + +std::tuple max_out(Tensor& max, Tensor& max_indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "max only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(max, self, dim, keepdim, "max")) { + AT_ASSERT(max.dim() == 0); + max_indices.resize_({}).fill_(0); + return std::forward_as_tuple(max, max_indices); + } else { + return at::_th_max_out(max, max_indices, self, dim, keepdim); + } +} + +Tensor max_values(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<0>(self.max(dim, keepdim)); +} + +std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { + Tensor min = self.type().tensor(); + Tensor min_indices = self.type().toScalarType(kLong).tensor(); + return at::native::min_out(min, min_indices, self, dim, keepdim); +} + +std::tuple min_out(Tensor& min, Tensor& min_indices, + const Tensor& self, int64_t dim, bool keepdim) { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "min only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + dim = maybe_wrap_dim(dim, self.dim()); + if (_dimreduce_return_trivial_no_ident(min, self, dim, keepdim, "min")) { + AT_ASSERT(min.dim() == 0); + min_indices.resize_({}).fill_(0); + return std::forward_as_tuple(min, min_indices); + } else { + return at::_th_min_out(min, min_indices, self, dim, keepdim); + } +} + +Tensor min_values(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<0>(self.min(dim, keepdim)); +} + +// argmax and argmin + +Tensor argmax(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<1>(self.max(dim, keepdim)); +} + +Tensor argmax(const Tensor& self) { + return std::get<1>(self.reshape({-1}).max(/*dim=*/0)); +} + +Tensor argmin(const Tensor& self, int64_t dim, bool keepdim) { + return std::get<1>(self.min(dim, keepdim)); +} + +Tensor argmin(const Tensor& self) { + return std::get<1>(self.reshape({-1}).min(/*dim=*/0)); +} + +// `argmin` and `argmax` are exposed in C++ but not in 
Python, where we only +// expose `_argmin` and `_argmax` (which call the first versions). In Python, +// we then define our own `argmax` and `argmin` that handle passing `dim=None`, +// which gets the argmax/argmin of the flattened array. + +Tensor _argmax(const Tensor& self, int64_t dim, bool keepdim) { + return at::argmax(self, dim, keepdim); +} + +Tensor _argmin(const Tensor& self, int64_t dim, bool keepdim) { + return at::argmin(self, dim, keepdim); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp new file mode 100644 index 0000000..d8c856b --- /dev/null +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -0,0 +1,636 @@ +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif + +#include "ATen/ATen.h" +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Dispatch.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" +#include "ATen/ScalarType.h" +#include "ATen/Deprecated.h" +#include "ATen/TensorOptions.h" +#include "TH/THRandom.h" + +#include +#include +#include + +namespace at { +namespace native { +namespace { +void window_function_checks( + const char* function_name, + const TensorOptions& options, + int64_t window_length) { + AT_CHECK( + options.layout() != kSparse, + function_name, + " is not implemented for sparse types, got: ", + options.type().toString()); + AT_CHECK( + at::isFloatingType(options.dtype()), + function_name, + " expects floating point dtypes, got: ", + options.type().toString()); + AT_CHECK( + window_length >= 0, + function_name, + " requires non-negative window_length, got window_length=", + window_length); +} +} // namespace + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor arange(Scalar start, Scalar end, const TensorOptions& options) { + return native::arange(start, end, /*step=*/1, options); +} + +Tensor arange( + Scalar start, + Scalar end, + Scalar step, + const TensorOptions& options) { + return options.type()._arange(start, end, step); +} + +Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { + return native::arange_out(result, start, end, /*step=*/1); +} + +Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { + return at::_arange_out(result, start, end, step); +} + +Tensor arange(Scalar end, const TensorOptions& options) { + return options.type()._arange(end); +} + +Tensor& arange_out(Tensor& result, Scalar end) { + return at::_arange_out(result, end); +} + +Tensor _dim_arange(const Tensor& like, int64_t dim) { + return like.type().toScalarType(at::kLong)._arange(like.size(dim)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor empty(IntList size, const TensorOptions& options) { + return options.type().tensor(size); +} + +Tensor& empty_out(Tensor& result, IntList size) { + if (result.is_sparse()) { + result.sparse_raw_resize_(size, size.size(), 0); + } else { + result.resize_(size); + } + return result; +} + +// Temporary type cast operators. These are needed to trace type-casts now since +// Type's are not supported in the IR. Instead, we call down to these +// specialized operators for each datatype. 
+// TODO: remove when we have Type support in the IR + +#define DEFINE_CAST_OP(_1, n, _2) \ + Tensor _cast_##n(const Tensor& self, bool non_blocking) { \ + auto& target_type = self.type().toScalarType(ScalarType::n); \ + if (self.type() == target_type) \ + return self; \ + return target_type.copy(self, non_blocking); \ + } + +AT_FORALL_SCALAR_TYPES(DEFINE_CAST_OP) + +#undef DEFINE_CAST_OP + +Tensor empty_like(const Tensor& self) { + return native::empty_like(self, self.options()); +} + +Tensor empty_like(const Tensor& self, const TensorOptions& options) { + if (options.layout() == kSparse && self.type().is_sparse()) { + auto res = options.type().tensor({}); + // resize_as_ requires the same exact type. + res.sparse_raw_resize_(self.sizes(), self._sparseDims(), self._denseDims()); + + return res; + } + return native::empty(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor eye(int64_t n, const TensorOptions& options) { + return native::eye(n, -1, options); +} + +Tensor eye(int64_t n, int64_t m, const TensorOptions& options) { + auto tensor = options.type().tensor({}); + return at::eye_out(tensor, n, m); +} + +Tensor& eye_out_cpu(Tensor& result, int64_t n) { + return native::eye_out_cpu(result, n, -1); +} + +Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else + AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif + +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else + if(m < 0) { +#endif + m = n; + } + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + AT_DISPATCH_ALL_TYPES(result.type(), "eye", [&]() -> void { + scalar_t* result_data = result.data(); + for(int64_t i = 0; i < sz; i++) { + result_data[i*(result.strides()[0] + result.strides()[1])] = 1; + } + }); + + return result; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ full ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor full(IntList size, Scalar fill_value, const TensorOptions& options) { + if (options.layout() == kSparse) { + AT_ERROR("full(...) is not implemented for sparse layout"); + } + auto result = options.type().tensor(size); + return result.fill_(fill_value); +} + +Tensor& full_out(Tensor& result, IntList size, Scalar fill_value) { + if (result.is_sparse()) { + AT_ERROR("full(...) 
is not implemented for sparse layout"); + } + result.resize_(size); + return result.fill_(fill_value); +} + +Tensor full_like(const Tensor& self, Scalar fill_value) { + return native::full_like(self, fill_value, self.options()); +} + +Tensor full_like(const Tensor& self, Scalar fill_value, const TensorOptions& options) { + return native::full(self.sizes(), fill_value, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linspace ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor linspace(Scalar start, Scalar end, const TensorOptions& options) { + return native::linspace(start, end, /*steps=*/100, options); +} + +Tensor linspace( + Scalar start, + Scalar end, + int64_t steps, + const TensorOptions& options) { + return options.type()._linspace(start, end, steps); +} + +Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { + return native::linspace_out(result, start, end, /*steps=*/100); +} + +Tensor& linspace_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { + return at::_linspace_out(result, start, end, steps); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ logspace ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor logspace(Scalar start, Scalar end, const TensorOptions& options) { + return native::logspace(start, end, /*steps=*/100, options); +} + +Tensor logspace( + Scalar start, + Scalar end, + int64_t steps, + const TensorOptions& options) { + return options.type()._logspace(start, end, steps); +} + +Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { + return native::logspace_out(result, start, end, /*steps=*/100); +} + +Tensor& logspace_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { + return at::_logspace_out(result, start, end, steps); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ones ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor ones(IntList size, const TensorOptions& options) { + return native::full(size, /*fill_value=*/1, options); +} + +Tensor& ones_out(Tensor& result, IntList size) { + return native::full_out(result, size, /*fill_value=*/1); +} + +Tensor ones_like(const Tensor& self) { + return native::ones(self.sizes(), self.options()); +} + +Tensor ones_like(const Tensor& self, const TensorOptions& options) { + return native::ones(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ rand ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor rand(IntList size, const TensorOptions& options) { + return native::rand(size, nullptr, options); +} + +Tensor rand(IntList size, Generator* generator, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.uniform_(0, 1, generator); +} + +Tensor& rand_out(Tensor& result, IntList size) { + return native::rand_out(result, size, nullptr); +} + +Tensor& rand_out(Tensor& result, IntList size, Generator* generator) { + result.resize_(size); + return result.uniform_(0, 1, generator); +} + +Tensor rand_like(const Tensor& self) { + return native::rand_like(self, self.options()); +} + +Tensor rand_like(const Tensor& self, const TensorOptions& options) { + return native::rand(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randint ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor randint(int64_t high, IntList size, const TensorOptions& options) { + return native::randint(high, size, nullptr, options); +} + +Tensor randint( + int64_t high, + IntList size, + Generator* generator, + const TensorOptions& options) { + return native::randint(0, high, size, generator, options); +} + +Tensor randint( + int64_t low, + int64_t high, + IntList size, + const 
TensorOptions& options) { + return native::randint(low, high, size, nullptr, options); +} + +Tensor randint( + int64_t low, + int64_t high, + IntList size, + Generator* generator, + const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.random_(low, high, generator); +} + +Tensor& randint_out(Tensor& result, int64_t high, IntList size) { + return native::randint_out(result, high, size, nullptr); +} + +Tensor& randint_out( + Tensor& result, + int64_t high, + IntList size, + Generator* generator) { + result.resize_(size); + return result.random_(0, high, generator); +} + +Tensor& randint_out(Tensor& result, int64_t low, int64_t high, IntList size) { + return native::randint_out(result, low, high, size, nullptr); +} + +Tensor& randint_out( + Tensor& result, + int64_t low, + int64_t high, + IntList size, + Generator* generator) { + result.resize_(size); + return result.random_(low, high, generator); +} + +Tensor randint_like(const Tensor& self, int64_t high) { + return native::randint_like(self, high, self.options()); +} + +Tensor randint_like(const Tensor& self, int64_t low, int64_t high) { + return native::randint_like(self, low, high, self.options()); +} + +Tensor randint_like( + const Tensor& self, + int64_t high, + const TensorOptions& options) { + return native::randint(high, self.sizes(), nullptr, options); +} + +Tensor randint_like( + const Tensor& self, + int64_t low, + int64_t high, + const TensorOptions& options) { + return native::randint(low, high, self.sizes(), nullptr, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor randn(IntList size, const TensorOptions& options) { + return native::randn(size, nullptr, options); +} + +Tensor randn(IntList size, Generator* generator, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.normal_(0, 1, generator); +} + +Tensor& randn_out(Tensor& result, IntList size) { + return native::randn_out(result, size, nullptr); +} + +Tensor& randn_out(Tensor& result, IntList size, Generator* generator) { + result.resize_(size); + return result.normal_(0, 1, generator); +} + +Tensor randn_like(const Tensor& self) { + return native::randn_like(self, self.options()); +} + +Tensor randn_like(const Tensor& self, const TensorOptions& options) { + return native::randn(self.sizes(), nullptr, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +namespace { +template +void randperm_cpu(Tensor& result, int64_t n, THGenerator* generator) { + scalar_t *r__data = result.data(); + + result.resize_({n}); + int64_t r__stride_0 = result.stride(0); + + for(int64_t i = 0; i < n; i++) { + r__data[i*r__stride_0] = static_cast(i); + } + + for(int64_t i = 0; i < n - 1; i++) + { + int64_t z = THRandom_random(generator) % (n-i); + scalar_t sav = r__data[i*r__stride_0]; + r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; + r__data[(z+i)*r__stride_0] = sav; + } +} +} // namespace + + +THGenerator* get_generator(at::Generator* gen) { + auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto gen_ = at::check_generator(gen, default_gen); + return gen_->generator; +} + +Tensor randperm(int64_t n, const TensorOptions& options) { + return native::randperm(n, nullptr, options); +} + +Tensor randperm(int64_t n, Generator* generator, const TensorOptions& options) { + auto tensor = options.type().tensor(n); + return at::randperm_out(tensor, n, generator); +} + +Tensor& 
randperm_out(Tensor& result, int64_t n) { + return at::randperm_out(result, n, nullptr); +} + +Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { + AT_CHECK(n >= 0, "n must be non-negative, got", n); + result.resize_({n}); + auto gen = get_generator(generator); + AT_DISPATCH_ALL_TYPES(result.type(), "randperm", [&]() -> void { + randperm_cpu(result, n, gen); + }); + + return result; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor range(Scalar start, Scalar end, const TensorOptions& options) { + return native::range(start, end, /*step=*/1, options); +} + +Tensor range( + Scalar start, + Scalar end, + Scalar step, + const TensorOptions& options) { + return options.type()._range(start, end, step); +} + +Tensor& range_out(Tensor& result, Scalar start, Scalar end) { + return native::range_out(result, start, end, 1); +} + +Tensor& range_out(Tensor& result, Scalar start, Scalar end, Scalar step) { + return at::_range_out(result, start, end, step); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ zeros ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor zeros(IntList size, const TensorOptions& options) { + auto result = options.type().tensor(size); + return result.zero_(); +} + +Tensor& zeros_out(Tensor& result, IntList size) { + if (result.is_sparse()) { + result.sparse_raw_resize_(size, size.size(), 0); + } else { + result.resize_(size); + } + return result.zero_(); +} + +Tensor zeros_like(const Tensor& self) { + return native::zeros_like(self, self.options()); +} + +Tensor zeros_like(const Tensor& self, const TensorOptions& options) { + if (options.layout() == kSparse && self.type().is_sparse()) { + auto res = options.type().tensor({}); + // resize_as_ requires the same exact type. + res.sparse_raw_resize_(self.sizes(), self._sparseDims(), self._denseDims()); + return res; + } + return native::zeros(self.sizes(), options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ bartlett_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor bartlett_window(int64_t window_length, const TensorOptions& options) { + return native::bartlett_window(window_length, /*periodic=*/true, options); +} + +Tensor bartlett_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("bartlett_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + auto window = native::arange(window_length, options).mul_(2. / static_cast(window_length - 1)); + const int64_t first_half_size = ((window_length - 1) >> 1) + 1; + window.narrow(0, first_half_size, window_length - first_half_size).mul_(-1).add_(2); + return periodic ? 
window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor blackman_window(int64_t window_length, const TensorOptions& options) { + return native::blackman_window(window_length, /*periodic=*/true, options); +} + +Tensor blackman_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("blackman_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + // from https://en.wikipedia.org/wiki/Window_function#Blackman_window + auto window = native::arange(window_length, options).mul_(M_PI / static_cast(window_length - 1)); + window = window.mul(4).cos_().mul_(0.08) - window.mul(2).cos_().mul_(0.5) + 0.42; + return periodic ? window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hamming_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor hamming_window(int64_t window_length, const TensorOptions& options) { + return native::hamming_window(window_length, /*periodic=*/true, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + return native::hamming_window( + window_length, periodic, /*alpha=*/0.54, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + double alpha, + const TensorOptions& options) { + return native::hamming_window( + window_length, periodic, alpha, /*beta=*/0.46, options); +} + +Tensor hamming_window( + int64_t window_length, + bool periodic, + double alpha, + double beta, + const TensorOptions& options) { + window_function_checks("hamming_window", options, window_length); + if (window_length == 1) { + return native::ones({1}, options); + } + if (periodic) { + window_length += 1; + } + auto window = native::arange(window_length, options); + window.mul_(M_PI * 2. / static_cast(window_length - 1)).cos_().mul_(-beta).add_(alpha); + return periodic ? 
window.narrow(0, 0, window_length - 1) : window; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hann_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tensor hann_window(int64_t window_length, const TensorOptions& options) { + return native::hann_window(window_length, /*periodic=*/true, options); +} + +Tensor hann_window( + int64_t window_length, + bool periodic, + const TensorOptions& options) { + window_function_checks("hann_window", options, window_length); + return native::hamming_window( + window_length, periodic, /*alpha=*/0.5, /*beta=*/0.5, options); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { + auto result = at::empty(values.size(), options); + AT_ASSERT(result.is_contiguous()); + AT_DISPATCH_ALL_TYPES(result.type(), "tensor_cpu", [&] { + std::copy(values.begin(), values.end(), result.template data()); + }); + return result; +} + +template +Tensor tensor_cuda(ArrayRef values, const TensorOptions& options) { + auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(at::kCPU)); + return cpu_tensor.to(options.device()); +} + +#define TENSOR(T, _1, _2) \ + Tensor tensor(ArrayRef values, const TensorOptions& options) { \ + if (options.device().is_cuda()) { \ + return tensor_cuda(values, options); \ + } else { \ + return tensor_cpu(values, options); \ + } \ + } +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) +#undef TENSOR +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp new file mode 100644 index 0000000..881f626 --- /dev/null +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -0,0 +1,39 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/detail/CUDAHooksInterface.h" + +#include "ATen/Config.h" +namespace at { +namespace native { + +bool is_same_size(const Tensor& self, const Tensor& other) { + return self.sizes().equals(other.sizes()); +} + +int64_t size(const Tensor& self, int64_t dim) { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = maybe_wrap_dim(dim, self.dim(), false); + return self.sizes()[dim]; +} + +int64_t stride(const Tensor& self, int64_t dim) { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = maybe_wrap_dim(dim, self.dim(), false); + return self.strides()[dim]; +} + +bool cudnn_is_acceptable(const Tensor& self) { + if (!globalContext().userEnabledCuDNN()) return false; + if (!self.is_cuda()) return false; + auto st = self.type().scalarType(); + if (!(st == kDouble || st == kFloat || st == kHalf)) return false; + if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; + // NB: In the old Python code, there was also a test to see if the + // cuDNN library was actually dynamically linked or not. I'm not + // sure if we can actually test this. 
+ return true; +} + +} +} diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp new file mode 100644 index 0000000..f248f3e --- /dev/null +++ b/aten/src/ATen/native/TensorShape.cpp @@ -0,0 +1,679 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/optional.h" +#include + +#include +#include + +namespace at { +namespace native { + +static void check_cat_no_zero_dim(TensorList tensors) { + for(size_t i = 0; i < tensors.size(); ++i) { + auto& t = tensors[i]; + if (t.dim() == 0) { + AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + } + } +} + +Tensor & cat_out(Tensor & result, TensorList tensors, int64_t dim) { + check_cat_no_zero_dim(tensors); + dim = legacy_cat_wrap_dim(dim, tensors); + return at::_cat_out(result, tensors, dim); +} + +Tensor cat(TensorList tensors, int64_t dim) { + check_cat_no_zero_dim(tensors); + dim = legacy_cat_wrap_dim(dim, tensors); + return at::_cat(tensors, dim); +} + +std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { + if (self.dim() == 0) { + AT_ERROR("chunk expects at least a 1-dimensional tensor"); + } + if (chunks <= 0) { + AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); + } + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; + + // We need to call split_with_sizes in the case where split_size and dimension size are 0, because + // a call to split would discard the number of chunks (because we can have an arbitrary number of + // 0-sized chunks adding up to 0). So, call split_with_sizes with the correct number of chunks, + // eventually we will do this for all cases. + if (split_size == 0 && self.size(dim) == 0) { + std::vector split_sizes(chunks, split_size); + split_sizes[chunks - 1] = split_size - (split_size * chunks - self.size(dim)); + return self.split_with_sizes(split_sizes, dim); + } else { + return self.split(split_size, dim); + } +} + +Tensor diagflat(const Tensor& self, int64_t offset) { + return self.contiguous().view(-1).diag(offset); +} + +Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_) { + int64_t nDims = self.dim(); + int64_t dim1 = maybe_wrap_dim(dim1_, nDims); + int64_t dim2 = maybe_wrap_dim(dim2_, nDims); + AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + int64_t diag_size; + int64_t storage_offset = self.storage_offset(); + // compute storage offset and size for the diagonal + // for positive values of offset (above the main diagonal) + // "leftmost columns" (along dim2) are dropped + // for negative values of offset (below the main diagonal) + // "topmost rows" (along dim1) are dropped. + // Note that we invert +/- in the second to absorb the negative + // sign in the offset. + if (offset >= 0) { + diag_size = std::max(std::min(self.size(dim1), self.size(dim2)-offset), 0); + } else { + diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); + } +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude +#endif + + // NumPy allows you to specify offsets "off the end"; let's just be careful not to + // set a ridiculous storage_offset in that case (technically it shouldn't matter + // because there are no elements in the tensor, but let's be kosher). 
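+  // Worked example (illustrative): for a 3 x 4 input, offset = 1 gives
+  // diag_size = max(min(3, 4 - 1), 0) = 3, while offset = -2 gives
+  // diag_size = max(min(3 - 2, 4), 0) = 1.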
+ if (diag_size == 0) { + // skip + } else if (offset >= 0) { + storage_offset += offset * self.stride(dim2); + } else { + storage_offset -= offset * self.stride(dim1); + } + + // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) + // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + sizes.erase(sizes.begin() + std::max(dim1, dim2)); + strides.erase(strides.begin() + std::max(dim1, dim2)); + sizes.erase(sizes.begin() + std::min(dim1, dim2)); + strides.erase(strides.begin() + std::min(dim1, dim2)); + sizes.push_back(diag_size); + strides.push_back(self.stride(dim1)+self.stride(dim2)); + + // return view with new parameters + return self.as_strided(sizes, strides, storage_offset); +} + +Tensor expand(const Tensor& self, IntList size, bool implicit) { + // [expand implicit] + // The implicit flag is set to true for any expand calls inserted by broadcast + // operators in ExpandUtils.h This flag is recorded by the tracer to + // distinguish between expands inserted by broadcasts and those explicitly + // requested by the user, because it is legal to remove implicit expands + // from the graph, but not legal to remove the explicit ones. + if (size.size() < (size_t)self.dim()) { + std::ostringstream ss; + ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size + << "): the number of sizes provided (" << size.size() << ") " + << "must be greater or equal to the number of dimensions in the tensor (" + << self.dim() << ")"; + throw std::runtime_error(ss.str()); + } + + std::vector expandedSizes; + std::vector expandedStrides; + std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self, size); + + return self.as_strided(expandedSizes, expandedStrides); +} + +Tensor expand_as(const Tensor& self, const Tensor& other) { + return self.expand(other.sizes()); +} + +Tensor as_strided(const Tensor& self, IntList size, IntList stride) { + return self.as_strided(size, stride, self.storage_offset()); +} + +Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { + return self.as_strided_(size, stride, self.storage_offset()); +} + +Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { + AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + auto cur_size = self.size(dim); + if (start < 0) { + AT_ERROR("start out of range"); + } +#ifndef USE_TH_SIZE_ZERO_DIM + if (length <= 0 || start > cur_size - length) { +#else + if (length < 0 || start > cur_size - length) { +#endif + AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); + } + return at::slice(self, dim, start, start + length, 1); +} + +Tensor permute(const Tensor& self, IntList dims) { + auto nDims = self.dim(); + if (dims.size() != (size_t)nDims) { + AT_ERROR("number of dims don't match in permute"); + } + auto oldSizes = self.sizes(); + auto oldStrides = self.strides(); + std::vector newSizes(nDims); + std::vector newStrides(nDims); + std::vector seen(nDims); + for (int64_t i = 0; i < nDims; i++) { + auto dim = maybe_wrap_dim(dims[i], nDims); + if (seen[dim]) { + AT_ERROR("repeated dim in permute"); + } + seen[dim] = true; + newSizes[i] = oldSizes[dim]; + newStrides[i] = oldStrides[dim]; + } + return self.as_strided(newSizes, newStrides); +} + +Tensor repeat(const Tensor& self, IntList repeats) { + if (repeats.size() < 
(size_t)self.dim()) { + AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); + } + + // Add new leading dimensions to the tensor if the + // number of target dimensions is larger than the + // number of source dimensions. + int64_t num_new_dimensions = repeats.size() - self.dim(); + std::vector padded_size(num_new_dimensions, 1); + padded_size.insert(padded_size.end(), self.sizes().begin(), self.sizes().end()); + std::vector target_size(repeats.size()); + for(size_t idx = 0; idx < repeats.size(); ++idx) { + target_size[idx] = padded_size[idx] * repeats[idx]; + } + + Tensor xtensor = self.expand(padded_size); + + Tensor result = self.type().tensor(target_size); + Tensor urtensor = result.type().alias(result); + for (int64_t i = 0; i < xtensor.dim(); ++i) { + // can't unfold with step 0, so make sure step is at least 1 + // (it doesn't matter what it is in that case, because the size is 0). + urtensor = urtensor.unfold(i, xtensor.size(i), std::max(xtensor.size(i), 1)); + } + + urtensor.copy_(xtensor.expand_as(urtensor)); + + return result; +} + +// Infers the size of a dim with size -1, if it exists. Also checks that new +// shape is compatible with the number of elements. +static std::vector infer_size(IntList shape, int64_t numel) { + auto res = shape.vec(); + int64_t newsize = 1; + auto infer_dim = at::optional(); + for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { + if (shape[dim] == -1) { + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } + infer_dim = dim; + } else if (shape[dim] >= 0) { + newsize *= shape[dim]; + } else { + AT_ERROR("invalid shape dimension ", shape[dim]); + } + } + + if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (infer_dim) { + // we have a degree of freedom here to select the dimension size; follow NumPy semantics + // and just bail. + AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); + res[*infer_dim] = numel / newsize; + } +#ifndef USE_TH_SIZE_ZERO_DIM + if (numel == 0) { + // Collapse zero-element shapes into one dimension because TH handles zeros + // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this + // once we have multi-dimensional empty tensors. 
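+      // e.g. (illustrative): any reshape of a 0-element tensor, say to
+      // (2, 0, 3), currently comes back with the collapsed 1-D shape {0}.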
+ return {0}; + } +#endif + return res; + } + + std::ostringstream ss; + ss << "shape '" << shape << "' is invalid for input of size " << numel; + throw std::runtime_error(ss.str()); +} + +Tensor reshape(const Tensor& self, IntList proposed_shape) { + if (self.type().is_sparse()) { + AT_ERROR("reshape is not implemented for sparse tensors"); + } + auto shape = infer_size(proposed_shape, self.numel()); + if (auto stride = THTensor_compute_stride(self.sizes(), self.strides(), shape)) { + return self.as_strided(shape, *stride); + } + return at::_unsafe_view(self.clone(), shape); +} + +Tensor reshape_as(const Tensor& self, const Tensor& other) { + return self.reshape(other.sizes()); +} + +Tensor select(const Tensor& self, int64_t dim, int64_t index) { + int64_t ndim = self.dim(); + AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, ndim); + auto size = self.size(dim); + if (index < -size || index >= size) { + std::stringstream ss; + ss << "select(): index " << index << " out of range for tensor of size "; + ss << self.sizes() << " at dimension " << dim; + throw std::runtime_error(ss.str()); + } + if (index < 0) { + index += size; + } + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + auto storage_offset = self.storage_offset() + index * strides[dim]; + sizes.erase(sizes.begin() + dim); + strides.erase(strides.begin() + dim); + return self.as_strided(sizes, strides, storage_offset); +} + +Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_t step) { + int64_t ndim = self.dim(); + AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, ndim); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); + if (step <= 0) { + // TODO: support negative strides + throw std::runtime_error("slice step must be positive"); + } + if (start < 0) { + start += sizes[dim]; + } + if (end < 0) { + end += sizes[dim]; + } + if (start < 0) { + start = 0; + } else if (start >= sizes[dim]) { + start = sizes[dim]; + } + if (end < start) { + end = start; + } else if (end >= sizes[dim]) { + end = sizes[dim]; + } + auto storage_offset = self.storage_offset() + start * strides[dim]; + auto len = end - start; +#ifndef USE_TH_SIZE_ZERO_DIM + if (len == 0) { + // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now + return self.type().tensor(); + } +#endif + sizes[dim] = (len + step - 1) / step; // round-up + strides[dim] *= step; + return self.as_strided(sizes, strides, storage_offset); +} + +std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { + AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + AT_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + AT_CHECK(split_size > 0 || self.size(dim) == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. 
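+    // e.g. (illustrative): dim_size = 5, split_size = 2 -> num_splits = 3, and
+    // below last_split_size = 2 - (2 * 3 - 5) = 1, i.e. pieces of sizes 2, 2, 1.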
+ num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + std::vector splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - dim_size); + + for (int64_t i = 0; i < num_splits; ++i) { + auto length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = self.narrow(dim, i * split_size, length); + } + return splits; +} + +std::vector split_with_sizes(const Tensor& self, IntList split_sizes, int64_t dim) { + AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + int64_t dim_size = self.size(dim); + int64_t num_splits = split_sizes.size(); + std::vector splits(num_splits); + int64_t start_idx = 0; + int64_t i; + + for (i = 0; i < num_splits; ++i) { + auto length = split_sizes[i]; + if (length < 0) { + std::ostringstream ss; + ss << "split_with_sizes expects split_sizes have only non-negative " + << "entries, but got split_sizes=" << split_sizes; + throw std::runtime_error(ss.str()); + } + splits[i] = self.narrow(dim, start_idx, length); + start_idx += length; + } + if (start_idx != dim_size) { + std::ostringstream ss; + ss << "split_with_sizes expects split_sizes to sum exactly to " + << dim_size << " (input tensor's size at dimension " << dim << "), " + << "but got split_sizes=" << split_sizes; + throw std::runtime_error(ss.str()); + } + return splits; +} + +static inline std::vector get_stack_inputs(TensorList tensors, int64_t dim) { + std::vector inputs(tensors.size()); + for (size_t i = 0; i < tensors.size(); ++i) { + inputs[i] = tensors[i].unsqueeze(dim); + } + return inputs; +} + +Tensor stack(TensorList tensors, int64_t dim) { + if (tensors.size() == 0) { + throw std::runtime_error("stack expects a non-empty TensorList"); + } + dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); + return at::cat(get_stack_inputs(tensors, dim), dim); +} + +Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { + if (tensors.size() == 0) { + throw std::runtime_error("stack expects a non-empty TensorList"); + } + dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); + return at::cat_out(result, get_stack_inputs(tensors, dim), dim); +} + +static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + int64_t nsparseDims = self._sparseDims(); + if (dim0 >= nsparseDims || dim1 >= nsparseDims) { + AT_ERROR( + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); + } + + if (self._indices().numel() == 0 && self._values().numel() == 0) { + std::vector sizes(self.sizes()); + std::swap(sizes[dim0], sizes[dim1]); + + return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); + } else { + auto indices = self._indices(); + auto row0 = indices.select(0, dim0); + auto row1 = indices.select(0, dim1); + + // swap row0 and row1 + auto tmp = at::zeros_like(row0); + tmp.copy_(row0); + row0.copy_(row1); + row1.copy_(tmp); + + std::vector sizes(self.sizes()); + std::swap(sizes[dim0], sizes[dim1]); + + return self.sparse_raw_resize_(sizes, -1, -1); + } +} + +Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { + auto ndims = self.dim(); + dim0 = maybe_wrap_dim(dim0, ndims); + dim1 = maybe_wrap_dim(dim1, ndims); + if (dim0 == dim1) { + return self; + } + + if (self.is_sparse()) { + return sparse_transpose_(self, dim0, dim1); + } + + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); + std::swap(strides[dim0], strides[dim1]); + std::swap(sizes[dim0], sizes[dim1]); + return 
self.as_strided_(sizes, strides); +} + +Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { + auto ndims = self.dim(); + dim0 = maybe_wrap_dim(dim0, ndims); + dim1 = maybe_wrap_dim(dim1, ndims); + if (dim0 == dim1) { + return self; + } + + if (self.is_sparse()) { + Tensor self_clone = self.clone(); // yes, this is what THS does + return sparse_transpose_(self_clone, dim0, dim1); + } + + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); + std::swap(strides[dim0], strides[dim1]); + std::swap(sizes[dim0], sizes[dim1]); + return self.as_strided(sizes, strides); +} + +static void check_t(const Tensor& self, const char *fn) { + if (self.is_sparse()) { + int64_t sparseDims = self._sparseDims(); + int64_t denseDims = self._denseDims(); + if (!(sparseDims == 2 && denseDims == 0)) { + AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); + } + } else if (self.dim() != 2) { + AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); + } +} + +Tensor t(const Tensor & self) { + check_t(self, "t()"); + return self.transpose(0, 1); +} + +Tensor & t_(Tensor & self) { + check_t(self, "t_()"); + return self.transpose_(0, 1); +} + +std::tuple, std::vector > +inferSqueezeGeometry(const Tensor &tensor) { + std::vector sizes; + std::vector strides; + + for(int64_t d = 0; d < tensor.dim(); d++) { + if(tensor.sizes()[d] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } + } + + return std::make_tuple(sizes, strides); +} + +std::tuple, std::vector > +inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { + std::vector sizes; + std::vector strides; + + for(int64_t d = 0; d < tensor.dim(); d++) { + if(d != dim || tensor.sizes()[dim] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } + } + return std::make_tuple(sizes, strides); +} + +std::tuple, std::vector > +inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { +#ifndef USE_TH_SIZE_ZERO_DIM + if (tensor.numel() == 0) { + throw std::runtime_error("cannot unsqueeze empty tensor"); + } +#endif + std::vector sizes(tensor.sizes()); + std::vector strides(tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; + sizes.insert(sizes.begin() + dim, 1); + strides.insert(strides.begin() + dim, new_stride); + + return std::make_tuple(sizes, strides); +} + +Tensor squeeze(const Tensor& self) { + auto g = inferSqueezeGeometry(self); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor squeeze(const Tensor& self, int64_t dim) { + int64_t dims = self.dim(); + dim = maybe_wrap_dim(dim, dims); + + if (dims == 0 || self.sizes()[dim] != 1) { + return self.as_strided(self.sizes().vec(), self.strides().vec()); + } + auto g = inferSqueezeGeometry(self, dim); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor & squeeze_(Tensor& self) { + auto g = inferSqueezeGeometry(self); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +Tensor & squeeze_(Tensor& self, int64_t dim) { + int64_t dims = self.dim(); + dim = maybe_wrap_dim(dim, self.dim()); + + if (dims == 0 || self.sizes()[dim] != 1) { + return self.as_strided_(self.sizes().vec(), self.strides().vec()); + } + auto g = inferSqueezeGeometry(self, dim); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +// _unsafe_view() differs from view() in that the returned tensor isn't treated +// as a view for the purposes of automatic differentiation. (It's not listed in +// VIEW_FUNCTIONS in gen_autograd.py). It's only safe to use if the `self` tensor +// is temporary. For example, the viewed tensor here (a + b) is discarded immediately +// after viewing: +// +// res = at::_unsafe_view(a + b, size); +// +// This is a hack because in-place operations on tensors treated like views +// can be much more expensive than the same operations on non-view tensors. +Tensor _unsafe_view(const Tensor& self, IntList size) { + return self.view(size); +} + +Tensor unsqueeze(const Tensor& self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim() + 1); + + auto g = inferUnsqueezeGeometry(self, dim); + return self.as_strided(std::get<0>(g), std::get<1>(g)); +} + +Tensor & unsqueeze_(Tensor& self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim() + 1); + + auto g = inferUnsqueezeGeometry(self, dim); + return self.as_strided_(std::get<0>(g), std::get<1>(g)); +} + +Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { + start_dim = maybe_wrap_dim(start_dim, self.dim()); + end_dim = maybe_wrap_dim(end_dim, self.dim()); + AT_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + + if (start_dim == end_dim) { + return self; + } + + // We don't want to infer_size on the entire shape, because that can give us an extra degree + // of freedom we don't want; for example, consider shape [0, 1, 3, 0], with start_dim=1, end_dim=2. + // It's clear we want result shape [0, 3, 0] but passing [0, -1, 0] to infer_size means the -1 + // can take on any value and satisfy the constraints. 
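+  // Concrete (illustrative) case: flattening a [2, 3, 4] tensor with
+  // start_dim=1, end_dim=2 gives slice_numel = 3 * 4 = 12 and result shape [2, 12].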
+ auto slice_numel = prod_intlist(self.sizes().slice(start_dim, end_dim - start_dim + 1)); + std::vector shape; + shape.reserve(self.dim() - end_dim + start_dim); + for (int64_t i = 0; i < start_dim; i++) { + shape.push_back(self.size(i)); + } + shape.push_back(slice_numel); + for (int64_t i = end_dim + 1; i < self.dim(); i++) { + shape.push_back(self.size(i)); + } + + return self.reshape(shape); +} + +Tensor view_as(const Tensor& self, const Tensor& other) { + return self.view(other.sizes()); +} + +int64_t numel(const Tensor& self) { + return self.pImpl->numel(); +} + +std::vector unbind(const Tensor &self, int64_t dim) { + dim = maybe_wrap_dim(dim, self.dim()); + int64_t size = self.size(dim); + std::vector tensors(size); + for (int i = 0; i < size; i++) { + tensors[i] = self.select(dim, i); + } + return tensors; +} + +std::vector meshgrid(TensorList tensors) { + int64_t size = tensors.size(); + AT_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); + std::vector shape(size); + for(int64_t i = 0; i < size; i++) { + switch (tensors[i].dim()) { + case 0: + shape[i] = 1; + break; + case 1: + shape[i] = tensors[i].size(0); + break; + default: + AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); + } + } + std::vector grids; + for(int64_t i = 0; i < size; i++) { + std::vector view_shape(size, 1); + view_shape[i] = -1; + grids.push_back(tensors[i].view(view_shape).expand(shape)); + } + return grids; +} + +} +} diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp new file mode 100644 index 0000000..8bce12c --- /dev/null +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -0,0 +1,59 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor flip_cpu(const Tensor& self, IntList dims) { + const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); + check_errors(total_dims, flip_dims_size, dims); + + auto flip_dims_v = std::vector(dims); + std::sort(flip_dims_v.begin(), flip_dims_v.end()); + auto final_indices = std::vector(total_dims); + + auto indices = std::vector(flip_dims_size); + for (int64_t i = 0; i < flip_dims_size; i++) { + indices[i] = at::arange(self.size(flip_dims_v[i]) - 1, -1, -1, self.type().toScalarType(at::kLong)); + // creates a meshgrid + auto temp = std::vector(flip_dims_size, 1); + temp[i] = indices[i].size(0); + indices[i] = indices[i].view(IntList(temp)); + final_indices[flip_dims_v[i]] = indices[i]; + } + + // check if distance between two flip dims >= 2, where permute of output tensor is needed, + // because the advanced indexing puts all non-consecutive indices in the beginning of the tensor + bool to_permute = false; + int64_t first = flip_dims_v[0], second = flip_dims_v[0]; + for (int64_t i = 1; i < flip_dims_size; i++) { + second = flip_dims_v[i]; + if (second - first >= 2) { + to_permute = true; + break; + } + first = second; + } + + if (to_permute) { + // permute output tensor + auto permute_order = std::vector(flip_dims_v); + for (int64_t i = 0; i < total_dims; i++) { + if (std::find(flip_dims_v.begin(), flip_dims_v.end(), i) == flip_dims_v.end()) { + permute_order.emplace_back(i); + } + } + auto out_tensor = self.index(TensorList(final_indices)); + return out_tensor.permute(IntList(permute_order)); + } + + auto out_tensor = self.index(TensorList(final_indices)); + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.h 
b/aten/src/ATen/native/TensorTransformations.h new file mode 100644 index 0000000..554a46f --- /dev/null +++ b/aten/src/ATen/native/TensorTransformations.h @@ -0,0 +1,39 @@ +#include "ATen/ATen.h" + +#include + +#include +#include + +namespace at { +namespace native { + +static inline void check_errors(int64_t total_dims, int64_t flip_dims_size, IntList dims) { + // check if number of axis in dim is valid + AT_CHECK(flip_dims_size > 0, + "expected input tensor dims > 0, but got tensor dims size=", flip_dims_size); + + // check duplicates in dims + auto flip_dims_v = std::vector(dims); + flip_dims_v.erase(std::unique(flip_dims_v.begin(), flip_dims_v.end()), flip_dims_v.end()); + AT_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, + "dims has duplicates, original flip dims size=", flip_dims_size, + ", but unique flip dims size=", flip_dims_v.size()); + + // check len of dims + AT_CHECK(flip_dims_size <= total_dims, + "expected flip dims size <= tensor total dims, but got flip dims size=", + flip_dims_size, " and tensor total dim=", total_dims); + + // check if dims axis within range + auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); + + AT_CHECK(*min_max_d.first >= 0, + "expected flip dims axis >= 0, but got min flip dims=", *min_max_d.first); + + AT_CHECK(*min_max_d.second < total_dims, + "expected flip dims axis < tensor total dims, but got max flip dims=", + *min_max_d.second, " and tensor total dim=", total_dims); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp new file mode 100644 index 0000000..a3c5f68 --- /dev/null +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -0,0 +1,37 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include + +namespace at { namespace native { + +bool is_cuda(const Tensor& self) { + return self.type().is_cuda(); +} + +bool is_distributed(const Tensor& self) { + return self.type().is_distributed(); +} + +bool is_floating_point(const Tensor& self) { + return at::isFloatingType(self.type().scalarType()); +} + +bool is_signed(const Tensor &self) { + if (self.type().scalarType() == ScalarType::Half) { + return true; + } + return AT_DISPATCH_ALL_TYPES(self.type(), "is_signed", [&]() -> bool { + return std::is_signed(); + }); +} + +bool is_sparse(const Tensor& self) { + return self.type().is_sparse(); +} + +Tensor type_as(const Tensor& self, const Tensor& other) { + return self.toType(other.type()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp new file mode 100644 index 0000000..f32a206 --- /dev/null +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -0,0 +1,100 @@ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" + +#include "ATen/CPUApplyUtils.h" +#include "ATen/Parallel.h" +#include "ATen/native/cpu/UnaryOpsKernel.h" + +#include +#include +#include +#include +#include + +#include + +// NOTE: +// YOU ARE NOT OBLIGED TO USE THESE MACROS +// If you're writing something more specialized, please don't try to make them +// work for your case, but just write something new instead. 
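To make the macro-generated entry points easier to follow, here is roughly what IMPLEMENT_UNARY_OP_VEC(acos), defined just below, expands to (a sketch, not part of the patch; acosImpl is the per-CPU-capability dispatch stub declared in native/cpu/UnaryOpsKernel.h):

```cpp
// Approximate expansion of IMPLEMENT_UNARY_OP_VEC(acos).
Tensor acos(const Tensor& self) {
  Tensor result = self.type().tensor();   // allocate an output of the same type
  return at::acos_out(result, self);
}
Tensor& _acos__cpu(Tensor& self_) {        // in-place variant
  if (self_.numel() > 0) {
    Tensor self = sort_strides(self_);
    acosImpl(self, self);                  // runtime-dispatched vectorized kernel
  }
  return self_;
}
Tensor& _acos_out_cpu(Tensor& result, const Tensor& self) {  // out-of-place variant
  result.resize_(self.sizes());
  if (result.numel() > 0) {
    acosImpl(result, self);
  }
  return result;
}
```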
+ +namespace at { +namespace native { + +Tensor& fill_(Tensor& self, Scalar value) { + return self._fill_(value); +} + +Tensor& fill_(Tensor& self, const Tensor& value) { + return self._fill_(value); +} + +// NB: If you use this macro, you may also need to add a CUDA forwarding +// stub in CUDAUnaryOps + +#define IMPLEMENT_UNARY_OP_VEC(op) \ + Tensor op(const Tensor& self) { \ + Tensor result = self.type().tensor(); \ + return at::op##_out(result, self); \ + } \ + Tensor& _##op##__cpu(Tensor& self_) { \ + if (self_.numel() > 0) { \ + Tensor self = sort_strides(self_); \ + op##Impl(self, self); \ + } \ + return self_; \ + } \ + Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ + result.resize_(self.sizes()); \ + if (result.numel() > 0) { \ + op##Impl(result, self); \ + } \ + return result; \ + } + +#define IMPLEMENT_UNARY_OP_TH(op) \ + Tensor op(const Tensor& self) { \ + Tensor result = self.type().tensor(); \ + return at::op##_out(result, self); \ + } \ + Tensor& _##op##__cpu(Tensor& self) { \ + return at::op##_out(self, self); \ + } \ + Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ + result.resize_(self.sizes()); \ + return at::_##op##_out(result, self); \ + } + +// NB: Temp. defaulting to TH implementation of abs due to issues with Apple + +IMPLEMENT_UNARY_OP_TH(abs) +IMPLEMENT_UNARY_OP_VEC(acos) +IMPLEMENT_UNARY_OP_VEC(asin) +IMPLEMENT_UNARY_OP_VEC(atan) +IMPLEMENT_UNARY_OP_VEC(ceil) +IMPLEMENT_UNARY_OP_VEC(cos) +IMPLEMENT_UNARY_OP_TH(cosh) +IMPLEMENT_UNARY_OP_VEC(erf) +IMPLEMENT_UNARY_OP_VEC(erfc) +IMPLEMENT_UNARY_OP_VEC(exp) +IMPLEMENT_UNARY_OP_VEC(expm1) +IMPLEMENT_UNARY_OP_VEC(floor) +IMPLEMENT_UNARY_OP_VEC(log) +IMPLEMENT_UNARY_OP_VEC(log10) +IMPLEMENT_UNARY_OP_VEC(log1p) +IMPLEMENT_UNARY_OP_VEC(log2) +IMPLEMENT_UNARY_OP_VEC(round) +IMPLEMENT_UNARY_OP_VEC(rsqrt) +IMPLEMENT_UNARY_OP_VEC(sigmoid) +IMPLEMENT_UNARY_OP_VEC(sin) +IMPLEMENT_UNARY_OP_TH(sinh) +IMPLEMENT_UNARY_OP_VEC(sqrt) +IMPLEMENT_UNARY_OP_VEC(tan) +IMPLEMENT_UNARY_OP_VEC(tanh) +IMPLEMENT_UNARY_OP_VEC(trunc) + +} +} // namespace at diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp new file mode 100644 index 0000000..d9bd94e --- /dev/null +++ b/aten/src/ATen/native/Unique.cpp @@ -0,0 +1,60 @@ +// Returns unique elements of input tensor. 
+ +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" + +#include +#include +#include +#include + +namespace at { +namespace native{ + +namespace { + +template +std::tuple _unique_cpu_template( + const Tensor& self, + const bool sorted, + const bool return_inverse) { + const Tensor& input = self.contiguous(); + const scalar_t* input_data = input.data(); + std::unordered_set set(input_data, input_data + input.numel()); + Tensor output = at::empty({static_cast(set.size())}, input.type()); + scalar_t* output_data = output.data(); + + if (sorted) { + std::vector vec(set.begin(), set.end()); + std::sort(vec.begin(), vec.end()); + std::copy(vec.begin(), vec.end(), output_data); + } else { + std::copy(set.begin(), set.end(), output_data); + } + + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + inverse_indices.resize_(input.sizes()); + int64_t* inverse_indices_data = inverse_indices.data(); + std::unordered_map inverse_map; + inverse_map.reserve(output.numel()); + for (int i = 0; i < output.numel(); ++i) { + inverse_map[output_data[i]] = i; + } + for (int i = 0; i < input.numel(); ++i) { + inverse_indices_data[i] = inverse_map[input_data[i]]; + } + } + return std::make_tuple(output, inverse_indices); +} +} // namespace + +std::tuple +_unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] { + return _unique_cpu_template(self, sorted, return_inverse); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp new file mode 100644 index 0000000..458e9ac --- /dev/null +++ b/aten/src/ATen/native/Vision.cpp @@ -0,0 +1,28 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" + +namespace { + enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; +} + +namespace at { namespace native { + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + padding_mode == GridSamplerModeZeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); + } + if (input.dim() == 5) { + return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); + } + AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/CapabilityDispatch.h b/aten/src/ATen/native/cpu/CapabilityDispatch.h new file mode 100644 index 0000000..6cb0f27 --- /dev/null +++ b/aten/src/ATen/native/cpu/CapabilityDispatch.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include + +// Implements instruction set specific function dispatch. +// +// Kernels that may make use of specialized instruction sets (e.g. AVX) are +// compiled multiple times with different compiler flags (e.g. -mavx). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In native/cpu/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// DispatchStub stub; +// +// In native/cpu/MyKernel.cpp: +// void kernel(const Tensor& x) { ... 
} +// REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(tensor); +// + +namespace at { +namespace native { + +enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; + +template +struct DispatchStub { + static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); + + template + void operator()(ArgTypes... args) { + if (!dispatch_ptr) { + dispatch_ptr = choose_impl(); + } + (*dispatch_ptr)(args...); + } + + FnPtr choose_impl() { +// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + int avx2 = static_cast(CPUCapability::AVX2); + if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && + cpuinfo_has_x86_fma3() && table[avx2]) { + return table[avx2]; + } + int avx = static_cast(CPUCapability::AVX); + if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { + return table[avx]; + } + } +#endif + int def = static_cast(CPUCapability::DEFAULT); + AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); + return table[def]; + } + + FnPtr dispatch_ptr = nullptr; + FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; +}; + + +#if defined(CPU_CAPABILITY) + +constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; + +// Registers an implementation a kernel for the current CPU capability. +template +struct RegisterDispatch { + RegisterDispatch(DispatchStub& stub, FnPtr value) { + stub.table[static_cast(CURRENT_CAPABILITY)] = value; + } +}; + +// We only define the stub once in the DEFAULT capability compilation +#if defined(CPU_CAPABILITY_DEFAULT) +#define _DEFINE_STUB(stub, fn) DispatchStub stub +#else +#define _DEFINE_STUB(stub, fn) +#endif + +#define REGISTER_DISPATCH(stub, fn) \ + _DEFINE_STUB(stub, fn); \ + static RegisterDispatch stub ## __register(stub, fn); + +#endif + +} +} diff --git a/aten/src/ATen/native/cpu/Intrinsics.h b/aten/src/ATen/native/cpu/Intrinsics.h new file mode 100644 index 0000000..702b2be --- /dev/null +++ b/aten/src/ATen/native/cpu/Intrinsics.h @@ -0,0 +1,25 @@ +#pragma once + +#if defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y]) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif diff --git a/aten/src/ATen/native/cpu/README b/aten/src/ATen/native/cpu/README new file mode 100644 index 0000000..ac8263d --- /dev/null +++ b/aten/src/ATen/native/cpu/README @@ -0,0 +1,30 @@ +TODO: Clarify and add more documentation all around. + +All of the *.cpp files in this folder will be compiled under all compiler +flags specified by CPU_CAPABILITY_FLAGS in aten/src/ATen/CMakeLists.txt. + +The purpose of this is to allow the compilation with various compiler +flags to enable features such as AVX instructions, while using runtime +dispatch, which makes sure only valid instructions will be used on any +given platform. 
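As a concrete illustration of the dispatch scheme described above, a minimal sketch of how a new kernel plugs into the DispatchStub/REGISTER_DISPATCH machinery from CapabilityDispatch.h (MyKernel.h, MyKernel.cpp and my_kernel follow the hypothetical names already used in that header's comment):

```cpp
// native/cpu/MyKernel.h (hypothetical)
#pragma once
#include "ATen/ATen.h"
#include "CapabilityDispatch.h"

namespace at { namespace native {
using my_fn = void (*)(Tensor&, const Tensor&);
extern DispatchStub<my_fn> my_kernel;  // one table slot per CPUCapability
}} // namespace at::native

// native/cpu/MyKernel.cpp (hypothetical), compiled once per CPU_CAPABILITY flag set
#include "MyKernel.h"
namespace at { namespace native {
namespace {
void my_kernel_impl(Tensor& out, const Tensor& in) {
  // capability-specific body; may use Vec256, AVX intrinsics, etc.
}
} // anonymous namespace

// Fills the table slot for the capability this translation unit was built with;
// the stub variable itself is only defined in the DEFAULT-capability build.
REGISTER_DISPATCH(my_kernel, &my_kernel_impl);
}} // namespace at::native

// A caller outside native/cpu (e.g. native/MyOp.cpp) just invokes the stub; the
// fastest registered implementation is chosen at runtime via cpuinfo:
//   my_kernel(result, self);
```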
+
+Vec256.h provides a generic implementation of a vec256 type that allows
+the programmer to write code packing various primitives (such as floats)
+within 256-bit registers. vec256 defines various operators such as + and *
+and provides functions to allow operations such as max, min, etc.
+
+As an example, ReduceOpsKernel.cpp implements a generic kernel_ that reduces
+an entire array using a given associative binary operation such as +.
+
+More explicitly, calling kernel_ with template argument std::plus will cause
+it to sum up the entire array into a single value.
+
+ReduceOpsKernel.cpp uses the CPU_CAPABILITY_* macros to "know" under which
+compiler flags it is currently compiled. This allows the programmer to write
+generic code, which will be compiled under multiple compilation settings.
+
+../ReduceOps.cpp now includes the header ReduceOpsKernel.h, which contains
+a generic definition of sumImplAll. This function allows the user to reduce
+over a dimension or all dimensions. The appropriate capability is chosen at
+runtime using cpuinfo. If the current platform has AVX, sumImpl will be set
+to sumImplAll.
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
new file mode 100644
index 0000000..0e749c2
--- /dev/null
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -0,0 +1,191 @@
+#include "ATen/native/cpu/ReduceOpsKernel.h"
+
+#include
+#include
+#include
+
+#include "ATen/Dispatch.h"
+#include "ATen/Parallel.h"
+#include "ATen/cpu/vec256/vec256.h"
+#include "ATen/optional.h"
+
+namespace at { namespace native { namespace {
+
+using namespace vec256;
+
+static inline int64_t round_down(int64_t a, int64_t m) {
+  return a - (a % m);
+}
+
+template <typename F>
+static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) {
+  if (parallelize) {
+    parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) {
+      int64_t k = begin * step;
+      for (int64_t i = begin; i < end; i++, k += step) {
+        func(k);
+      }
+    });
+  } else {
+    for (int64_t i = 0; i != size; i += step) {
+      func(i);
+    }
+  }
+}
+
+// Vectorized reduction defined by reduce operation `Op` with identity `ident`.
+// The reduction is built on top of reduce128, which reduces down a column
+// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
+// because of the "adjacent cache line prefetch" behavior on x86 CPUs.
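+// For example (illustrative): with float (4-byte) elements, WIDTH = 128 / 4 = 32,
+// which reduce128 holds in four Vec256<float> accumulators of 8 lanes each
+// (4 * 32 bytes = 128 bytes, i.e. two adjacent cache lines).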
+template class Op, int ident> +struct Reduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + + using Vec = Vec256; + using Reduce = Op; + using ReduceScalar = Op; + + static void apply(Tensor& res, const Tensor& self, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel); + return; + } + + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + // A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / (n * stride); + bool paralellize = batch * n > internal::GRAIN_SIZE; + if (stride == 1) { + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t b = begin; b < end; b++) { + const scalar_t* data = &data_[b * n]; + scalar_t* out = &out_[b]; + scalar_t buf[WIDTH] = {0}; + std::fill(buf, buf + WIDTH, ident); + int64_t cols_rounded = n / WIDTH; + reduce128(data, buf, cols_rounded, WIDTH); + scalar_t result = ident; + for (int64_t i = 0; i < WIDTH; i++) { + result = ReduceScalar()(result, buf[i]); + } + for (int64_t col = cols_rounded * WIDTH; col != n; col++) { + result = ReduceScalar()(result, data[col]); + } + out_[b] = result; + } + }); + } else { + int64_t rows = n; + int64_t cols = stride; + int64_t cols_rounded = round_down(cols, WIDTH); + int64_t size = cols_rounded; + parallel_for( + 0, + batch * (size / WIDTH), + 1, + [out_, data_, n, stride, rows, cols, cols_rounded, size]( + int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / (size / WIDTH); + int64_t i = bi % (size / WIDTH); + int64_t k = i * WIDTH; + reduce128( + &data_[b * n * stride + k], + &out_[b * stride + k], + rows, + stride); + } + }); + + _parallel_for(batch, 1, paralellize, [=](int64_t b) { + const scalar_t* data = &data_[b * n * stride]; + scalar_t* out = &out_[b * stride]; + int64_t rows = n; + int64_t cols = stride; + + int64_t cols_rounded = round_down(cols, WIDTH); + if (cols_rounded != cols) { + scalar_t buf[WIDTH] = {0}; + std::fill(buf, buf + WIDTH, ident); + for (int64_t row = 0; row != rows; row++) { + for (int64_t j = 0; j != cols - cols_rounded; j++) { + auto val = data[row * stride + j + cols_rounded]; + buf[j] = ReduceScalar()(buf[j], val); + } + } + for (int64_t j = 0; j != cols - cols_rounded; j++) { + out[j + cols_rounded] = buf[j]; + } + } + }); + } + } + + static scalar_t reduce_all(const scalar_t* data, int64_t size) { + int64_t k = size / WIDTH; + + scalar_t sum = parallel_reduce( + 0, + k, + internal::GRAIN_SIZE / WIDTH, + (scalar_t)ident, + [data](int64_t begin, int64_t end, scalar_t init) { + scalar_t buf[WIDTH]; + reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH); + return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); + }, + ReduceScalar()); + + for (int64_t i = k * WIDTH; i != size; i++) { + sum = ReduceScalar()(sum, data[i]); + } + return sum; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number + // of rows. Stores the results in out[0 ... WIDTH-1]. 
+ static void reduce128(const scalar_t* data, scalar_t* out, int64_t rows, int64_t stride) { + Vec acc[4] = {ident, ident, ident, ident}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + for (int64_t row = 0; row != rows; row++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * stride + j * Vec::size]); + acc[j] = Reduce()(acc[j], val); + } + } + for (int j = 0; j != 4; j++) { + acc[j].store(&out[j * Vec::size]); + } + } +}; + +static void sum_kernel_impl(Tensor& result, const Tensor& self, at::optional dim) { + AT_DISPATCH_ALL_TYPES(self.type(), "sum", [&] { + Reduction::apply(result, self, dim); + }); +} + +static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional dim) { + AT_DISPATCH_ALL_TYPES(self.type(), "prod", [&] { + Reduction::apply(result, self, dim); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); +REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h new file mode 100644 index 0000000..9481b90 --- /dev/null +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include "CapabilityDispatch.h" + +namespace at { +namespace native { + +using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional); + +extern DispatchStub sum_kernel; +extern DispatchStub prod_kernel; + +} +} diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp new file mode 100644 index 0000000..6cfa90f --- /dev/null +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -0,0 +1,268 @@ +#include "ATen/native/cpu/SoftmaxKernel.h" + +#include +#include +#include + +#include "ATen/Dispatch.h" +#include "ATen/Parallel.h" +#include "ATen/cpu/vec256/functional.h" +#include "ATen/cpu/vec256/vec256.h" +#include "ATen/optional.h" + +// [Note AVX-SSE transitions] In general we avoid calls into cmath for code +// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in +// Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 +// +// On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of +// computations per task. Each task works across dim_size elements. 16 should be +// a very rough approximation of the number of computations per dim_size element +// by counting simple computations (*, +, -) as 1 and exp or log as 4. 
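+//
+// Worked example (illustrative numbers only): with GRAIN_SIZE = 32768 and
+// dim_size = 1024, the kernels below would use
+//   grain_size = 32768 / (16 * 1024) = 2 outer rows per task,
+// clamped from below to 1 (the chunked log-softmax variant also divides by
+// CHUNK_SIZE and clamps to CHUNK_SIZE).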
+ +namespace at { namespace native { +namespace { + +template +inline void _vec_log_softmax_lastdim( + scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + if (grain_size < CHUNK_SIZE) + grain_size = CHUNK_SIZE; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { + scalar_t tmp_sum_scalar[CHUNK_SIZE]; + scalar_t max_input_arr[CHUNK_SIZE]; + int64_t loop_end = CHUNK_SIZE; + if (ii + CHUNK_SIZE > end) + loop_end = end - ii; + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + max_input_arr[j] = vec256::reduce_all( + [](Vec& x, Vec& y) { return vec256::max(x, y); }, + input_data, + dim_size); + } + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t max_input = max_input_arr[j]; + tmp_sum_scalar[j] = vec256::map_reduce_all( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + [](Vec x, Vec y) { return x + y; }, + input_data, + dim_size); + } + // See [Note AVX-SSE transitions] for why this should call the + // vectorized version (aside from perf improvements). + vec256::map2( + [](Vec x, Vec y) { return x.log() + y; }, + tmp_sum_scalar, + tmp_sum_scalar, + max_input_arr, + loop_end); + for (int64_t j = 0; j < loop_end; j++) { + int64_t i = ii + j; + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t tmp_sum = tmp_sum_scalar[j]; + vec256::map( + [tmp_sum](Vec x) { return x - Vec(tmp_sum); }, + output_data, + input_data, + dim_size); + } + } + }); +} + +template +inline void _vec_softmax_lastdim( + scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); + if (grain_size < 1) + grain_size = 1; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t max_input = vec256::reduce_all( + [](Vec& x, Vec& y) { return vec256::max(x, y); }, + input_data, + dim_size); + vec256::map( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + output_data, + input_data, + dim_size); + scalar_t tmp_sum = vec256::reduce_all( + [](Vec x, Vec y) { return x + y; }, output_data, dim_size); + tmp_sum = 1 / tmp_sum; + vec256::map( + [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, + output_data, + output_data, + dim_size); + } + }); +} + +template +inline void _vec_host_softmax_backward_lastdim( + scalar_t* grad_input_data_base, + scalar_t* grad_data_base, + scalar_t* output_data_base, + int64_t outer_size, + int64_t dim_size) { + using Vec = vec256::Vec256; + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); + if (grain_size < 1) + grain_size = 1; + + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + scalar_t* grad_input_data = grad_input_data_base + i * dim_size; + scalar_t* grad_data = grad_data_base + i * dim_size; + scalar_t* output_data = 
output_data_base + i * dim_size; + scalar_t sum; + if (log_softmax) { + sum = vec256::reduce_all( + [](Vec& x, Vec& y) { return x + y; }, grad_data, dim_size); + } else { + sum = vec256::map2_reduce_all( + [](Vec x, Vec y) { return x * y; }, + [](Vec x, Vec y) { return x + y; }, + grad_data, + output_data, + dim_size); + } + if (log_softmax) { + vec256::map2( + [sum](Vec x, Vec y) { return x - ((y.exp()) * Vec(sum)); }, + grad_input_data, + grad_data, + output_data, + dim_size); + } else { + vec256::map2( + [sum](Vec x, Vec y) { return (x - Vec(sum)) * y; }, + grad_input_data, + grad_data, + output_data, + dim_size); + } + } + }); +} + +template +struct vec_host_softmax_lastdim { + static void apply(Tensor& output, const Tensor& input) { + int64_t outer_size = 1; + int64_t dim_size = input.size(input.ndimension() - 1); + for (int64_t i = 0; i < input.ndimension() - 1; ++i) + outer_size *= input.size(i); + scalar_t* input_data_base = input.data(); + scalar_t* output_data_base = output.data(); + if (LogSoftMax) { + _vec_log_softmax_lastdim( + input_data_base, output_data_base, outer_size, dim_size); + } else { + _vec_softmax_lastdim( + input_data_base, output_data_base, outer_size, dim_size); + } + } +}; + +template +struct vec_host_softmax_backward_lastdim { + static void + apply(Tensor& grad_input, const Tensor& grad, const Tensor& output) { + int64_t outer_size = 1; + int64_t dim_size = grad.size(grad.ndimension() - 1); + for (int64_t i = 0; i < grad.ndimension() - 1; ++i) + outer_size *= grad.size(i); + scalar_t* grad_input_data_base = grad_input.data(); + scalar_t* grad_data_base = grad.data(); + scalar_t* output_data_base = output.data(); + _vec_host_softmax_backward_lastdim( + grad_input_data_base, + grad_data_base, + output_data_base, + outer_size, + dim_size); + } +}; + +static void softmax_lastdim_kernel_impl(Tensor& result, const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "softmax_lastdim_kernel_impl", [&] { + vec_host_softmax_lastdim::apply(result, self); + }); +} + +static void log_softmax_lastdim_kernel_impl( + Tensor& result, + const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES( + self.type(), "log_softmax_lastdim_kernel_impl", [&] { + vec_host_softmax_lastdim::apply(result, self); + }); +} + +static void softmax_backward_lastdim_kernel_impl( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output) { + AT_DISPATCH_FLOATING_TYPES( + grad.type(), "softmax_backward_lastdim_kernel_impl", [&] { + vec_host_softmax_backward_lastdim::apply( + grad_input, grad, output); + }); +} + +static void log_softmax_backward_lastdim_kernel_impl( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output) { + AT_DISPATCH_FLOATING_TYPES( + grad.type(), "log_softmax_backward_lastdim_kernel_impl", [&] { + vec_host_softmax_backward_lastdim::apply( + grad_input, grad, output); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(softmax_lastdim_kernel, &softmax_lastdim_kernel_impl); +REGISTER_DISPATCH(log_softmax_lastdim_kernel, &log_softmax_lastdim_kernel_impl); +REGISTER_DISPATCH( + softmax_backward_lastdim_kernel, + &softmax_backward_lastdim_kernel_impl); +REGISTER_DISPATCH( + log_softmax_backward_lastdim_kernel, + &log_softmax_backward_lastdim_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h new file mode 100644 index 0000000..dbd703b --- /dev/null +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include "CapabilityDispatch.h" 
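+
+// For reference, the backward kernels registered in SoftMaxKernel.cpp compute,
+// with y the forward output and dy the incoming gradient (sums over the last
+// dimension):
+//   softmax:     dx = (dy - sum(dy * y)) * y
+//   log_softmax: dx = dy - exp(y) * sum(dy)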
+ +namespace at { +namespace native { + +using forward_fn = void(*)(Tensor &, const Tensor &); +using backward_fn = void(*)(Tensor &, const Tensor &, const Tensor&); + +extern DispatchStub softmax_lastdim_kernel; +extern DispatchStub log_softmax_lastdim_kernel; +extern DispatchStub softmax_backward_lastdim_kernel; +extern DispatchStub log_softmax_backward_lastdim_kernel; + +} +} diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp new file mode 100644 index 0000000..7416923 --- /dev/null +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -0,0 +1,171 @@ +#include "ATen/native/cpu/UnaryOpsKernel.h" + +#include +#include "ATen/Dispatch.h" +#include "ATen/cpu/vml.h" +#include "ATen/CPUApplyUtils.h" +#include "ATen/native/cpu/CapabilityDispatch.h" +#ifdef __AVX2__ +#include "ATen/native/cpu/avx_mathfun.h" +#endif + +namespace at { namespace native { +namespace { + +using namespace vec256; + +template +static int64_t _sigmoid(scalar_t* x, scalar_t* y, int64_t size); + +// This should be a temporary solution until we understand why SLEEF is slower +// for sigmoid + +template <> +int64_t _sigmoid(float* x, float* y, int64_t size) { + using Vec = Vec256; + int64_t i = 0; + for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { + Vec ret = Vec::loadu(y + i); + Vec ret2 = Vec::loadu(y + i + Vec::size); + ret = ret.neg(); + ret2 = ret2.neg(); +#if defined(__AVX2__) && !defined(_MSC_VER) + ret = exp256_ps(ret); + ret2 = exp256_ps(ret2); +#else + ret = ret.exp(); + ret2 = ret2.exp(); +#endif + ret = Vec((float)(1)) + ret; + ret2 = Vec((float)(1)) + ret2; + ret = ret.reciprocal(); + ret2 = ret2.reciprocal(); + ret.store(x + i); + ret2.store(x + i + Vec::size); + } + return i; +} + +template <> +int64_t _sigmoid(double* x, double* y, int64_t size) { + using Vec = Vec256; + int64_t i = 0; + for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { + Vec ret = Vec::loadu(y + i); + Vec ret2 = Vec::loadu(y + i + Vec::size); + ret = ret.neg(); + ret2 = ret2.neg(); + ret = ret.exp(); + ret2 = ret2.exp(); + ret = Vec((double)(1)) + ret; + ret2 = Vec((double)(1)) + ret2; + ret = ret.reciprocal(); + ret2 = ret2.reciprocal(); + ret.store(x + i); + ret2.store(x + i + Vec::size); + } + return i; +} + +static void sigmoid_kernel(Tensor& result, const Tensor& self) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "sigmoid", [&] { + using Vec = Vec256; + CPU_tensor_parallel_kernel_apply2( + result, + self, + [](int64_t size, + scalar_t* x, + scalar_t* y, + int64_t stridex, + int64_t stridey) { + int64_t i = 0; + if (stridex == 1 && stridey == 1) { + i = _sigmoid(x, y, size); + } + for (; i < size; i += Vec::size) { + scalar_t buffer[Vec::size]; + int64_t width = Vec::size; + width = std::min(width, size - i); + for (int64_t j = 0; j < width; j++) { + buffer[j] = y[stridey * (i + j)]; + } + Vec ret = Vec::loadu(buffer); + ret = Vec((scalar_t)(0)) - ret; + ret = ret.exp(); + ret = Vec((scalar_t)(1)) + ret; + ret = ret.reciprocal(); + ret.store(buffer); + for (int64_t j = 0; j < width; j++) + x[stridex * (i + j)] = buffer[j]; + } + }); + }); +} + +#define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \ + static void op##_kernel(Tensor& result, const Tensor& self) { \ + AT_DISPATCH_##dispatchtypes##_TYPES(self.type(), #op, [&] { \ + if (self.is_contiguous() && result.is_contiguous()) { \ + vml::v##op( \ + result.data(), self.data(), self.numel()); \ + \ + } else { \ + static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t); \ + 
CPU_tensor_parallel_kernel_apply2( \ + result, \ + self, \ + [](int64_t size, \ + scalar_t* x, \ + scalar_t* y, \ + int64_t stridex, \ + int64_t stridey) { \ + if (stridex == 1 && stridey == 1) { \ + vml::v##op(x, y, size); \ + } else { \ + for (int64_t i = 0; i < size; i += WIDTH) { \ + scalar_t buffer[WIDTH]; \ + int64_t width = WIDTH; \ + width = std::min(width, size - i); \ + for (int64_t j = 0; j < width; j++) \ + buffer[j] = y[stridey * (i + j)]; \ + vml::v##op(buffer, buffer, width); \ + for (int64_t j = 0; j < width; j++) \ + x[stridex * (i + j)] = buffer[j]; \ + } \ + } \ + }); \ + } \ + }); \ + } \ + REGISTER_DISPATCH(op##Impl, &op##_kernel) + +} // anonymous namespace + +REGISTER_DISPATCH(sigmoidImpl, &sigmoid_kernel) + +// IMPLEMENT_FLOAT_KERNEL(ALL, abs) +IMPLEMENT_FLOAT_KERNEL(FLOATING, acos) +IMPLEMENT_FLOAT_KERNEL(FLOATING, asin) +IMPLEMENT_FLOAT_KERNEL(FLOATING, atan) +IMPLEMENT_FLOAT_KERNEL(FLOATING, ceil) +IMPLEMENT_FLOAT_KERNEL(FLOATING, cos) +// IMPLEMENT_FLOAT_KERNEL(FLOATING, cosh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, erf) +IMPLEMENT_FLOAT_KERNEL(FLOATING, erfc) +IMPLEMENT_FLOAT_KERNEL(FLOATING, exp) +IMPLEMENT_FLOAT_KERNEL(FLOATING, expm1) +IMPLEMENT_FLOAT_KERNEL(FLOATING, floor) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log10) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log1p) +IMPLEMENT_FLOAT_KERNEL(FLOATING, log2) +IMPLEMENT_FLOAT_KERNEL(FLOATING, round) +IMPLEMENT_FLOAT_KERNEL(FLOATING, rsqrt) +IMPLEMENT_FLOAT_KERNEL(FLOATING, sin) +// IMPLEMENT_FLOAT_KERNEL(FLOATING, sinh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, sqrt) +IMPLEMENT_FLOAT_KERNEL(FLOATING, tan) +IMPLEMENT_FLOAT_KERNEL(FLOATING, tanh) +IMPLEMENT_FLOAT_KERNEL(FLOATING, trunc) + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h new file mode 100644 index 0000000..d9bffad --- /dev/null +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include "CapabilityDispatch.h" + +namespace at { namespace native { + +using unary_fn = void(*)(Tensor&, const Tensor&); + +extern DispatchStub absImpl; +extern DispatchStub acosImpl; +extern DispatchStub asinImpl; +extern DispatchStub atanImpl; +extern DispatchStub ceilImpl; +extern DispatchStub cosImpl; +// extern DispatchStub coshImpl; +extern DispatchStub erfImpl; +extern DispatchStub erfcImpl; +extern DispatchStub expImpl; +extern DispatchStub expm1Impl; +extern DispatchStub floorImpl; +extern DispatchStub logImpl; +extern DispatchStub log10Impl; +extern DispatchStub log1pImpl; +extern DispatchStub log2Impl; +extern DispatchStub roundImpl; +extern DispatchStub rsqrtImpl; +extern DispatchStub sigmoidImpl; +extern DispatchStub sinImpl; +// extern DispatchStub sinhImpl; +extern DispatchStub sqrtImpl; +extern DispatchStub tanImpl; +extern DispatchStub tanhImpl; +extern DispatchStub truncImpl; + + +// Missing unary functions +// digamma +// lgamma + +// TODO: See below +// erfinv +// fill +// frac +// clone +// contiguous +// clamp/_min/_max +// neg +// reciprocal +// sigmoid +// sign +// zero + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h new file mode 100644 index 0000000..3e40146 --- /dev/null +++ b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -0,0 +1,715 @@ +#pragma once +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio 
+ Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include "Intrinsics.h" + +/* yes I know, the top of this file is quite ugly */ +#if defined(__GNUC__) +# define ALIGN32_BEG __attribute__((aligned(32))) +#elif defined(_WIN32) +# define ALIGN32_BEG __declspec(align(32)) +#endif + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] = { Val, Val, Val, Val } + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } + +_PS256_CONST(1 , 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +#ifndef __AVX2__ + +typedef union imm_xmm_union { + v8si imm; + v4si xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \ + imm_xmm_union u __attribute__((aligned(32))); \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ +} + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ + 
imm_xmm_union u __attribute__((aligned(32))); \ + u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ + } + + +#define AVX2_BITOP_USING_SSE2(fn) \ +static inline v8si _mm256_##fn(v8si x, int a) \ +{ \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1,a); \ + x2 = _mm_##fn(x2,a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return(ret); \ +} + +#warning "Using SSE2 to perform AVX2 bitshift ops" +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) + +#define AVX2_INTOP_USING_SSE2(fn) \ +static inline v8si _mm256_##fn(v8si x, v8si y) \ +{ \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1,y1); \ + x2 = _mm_##fn(x2,y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return(ret); \ +} + +#warning "Using SSE2 to perform AVX2 integer ops" +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) + +#endif /* __AVX2__ */ + + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +inline v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + + tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, 
-88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm256_cvttps_epi32(fx); + //tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + //v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = _mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+ +*/ +inline v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + +#ifndef __AVX2__ + v4si imm0_1, imm0_2; + v4si imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); + imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = _mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +void hardshrink_cuda_kernel(const Tensor& self, Tensor& out_tensor, scalar_t* lambd) { + at::cuda::CUDA_tensor_apply2( + self, + out_tensor, + [lambd] __device__ ( + scalar_t& self_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd && self_val <= *lambd) ? scalar_t(0) : self_val; + }); +} + +template +void hardshrink_backward_cuda_kernel(Tensor& out_tensor, scalar_t* lambd, const Tensor& self, const Tensor& grad) { + at::cuda::CUDA_tensor_apply3( + self, + grad, + out_tensor, + [lambd] __device__ ( + scalar_t& self_val, + scalar_t& grad_val, + scalar_t& out_tensor_val) { + out_tensor_val = (self_val >= -*lambd && self_val <= *lambd) ? scalar_t(0) : grad_val; + }); +} + +Tensor hardshrink_cuda(const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(self); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_cuda", [&] { + hardshrink_cuda_kernel(self, out_tensor, lambd_tensor.data()); + }); + return out_tensor; +} + +Tensor hardshrink_backward_cuda(const Tensor & grad, const Tensor & self, Scalar lambd) { + auto lambd_tensor = lambd.toTensor().toType(self.type().scalarType()).toBackend(self.is_cuda() ? 
Backend::CUDA : Backend::CPU); + auto out_tensor = at::empty_like(grad); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_backward_cuda", [&] { + hardshrink_backward_cuda_kernel(out_tensor, lambd_tensor.data(), self, grad); + }); + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/CUDAReduceOps.cpp b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp new file mode 100644 index 0000000..a3b32ce --- /dev/null +++ b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp @@ -0,0 +1,29 @@ +#include +#include "ATen/native/ReduceOpsUtils.h" + +namespace at { namespace native { + +Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); } + +Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); } + +Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim) { + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { + return result; + } else { + return at::_th_sum_out(result, self, dim, keepdim); + } +} + +Tensor &_prod_out_cuda(Tensor &result, const Tensor &self, int64_t dim, + bool keepdim) { + if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { + return result; + } else { + return at::_th_prod_out(result, self, dim, keepdim); + } +} + + +}} diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp new file mode 100644 index 0000000..2e524f4 --- /dev/null +++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp @@ -0,0 +1,42 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +// These are just forwarding stubs + +#define IMPLEMENT_UNARY_OP_PREQUEL(op) \ + Tensor& _##op##__cuda(Tensor& self) { \ + return at::_##op##_out(self, self); \ + } \ + Tensor& _##op##_out_cuda(Tensor& result, const Tensor& self) { \ + return at::_##op##_out(result, self); \ + } + + +IMPLEMENT_UNARY_OP_PREQUEL(abs) +IMPLEMENT_UNARY_OP_PREQUEL(acos) +IMPLEMENT_UNARY_OP_PREQUEL(asin) +IMPLEMENT_UNARY_OP_PREQUEL(atan) +IMPLEMENT_UNARY_OP_PREQUEL(ceil) +IMPLEMENT_UNARY_OP_PREQUEL(cos) +IMPLEMENT_UNARY_OP_PREQUEL(cosh) +IMPLEMENT_UNARY_OP_PREQUEL(erf) +IMPLEMENT_UNARY_OP_PREQUEL(erfc) +IMPLEMENT_UNARY_OP_PREQUEL(exp) +IMPLEMENT_UNARY_OP_PREQUEL(expm1) +IMPLEMENT_UNARY_OP_PREQUEL(floor) +IMPLEMENT_UNARY_OP_PREQUEL(log) +IMPLEMENT_UNARY_OP_PREQUEL(log10) +IMPLEMENT_UNARY_OP_PREQUEL(log1p) +IMPLEMENT_UNARY_OP_PREQUEL(log2) +IMPLEMENT_UNARY_OP_PREQUEL(round) +IMPLEMENT_UNARY_OP_PREQUEL(rsqrt) +IMPLEMENT_UNARY_OP_PREQUEL(sigmoid) +IMPLEMENT_UNARY_OP_PREQUEL(sin) +IMPLEMENT_UNARY_OP_PREQUEL(sinh) +IMPLEMENT_UNARY_OP_PREQUEL(sqrt) +IMPLEMENT_UNARY_OP_PREQUEL(tan) +IMPLEMENT_UNARY_OP_PREQUEL(tanh) +IMPLEMENT_UNARY_OP_PREQUEL(trunc) + +}} diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000..49c56cb --- /dev/null +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,399 @@ +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/native/cuda/CuFFTUtils.h" +#include "ATen/native/utils/ParamsHash.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { namespace detail { + +// This POD struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. 
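+// (Presumably the hash and equality helpers from ATen/native/utils/ParamsHash.h
+// operate on the raw bytes of this struct, which is why setCuFFTParams below
+// memsets it to zero first: padding bytes must be deterministic for bytewise
+// hashing and comparison to be meaningful.)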
+struct CuFFTParams +{ + at::ScalarType scalar_type_; + int64_t input_sizes_[max_rank + 2]; + int64_t input_strides_[max_rank + 2]; + uint8_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + bool complex_input_; + bool complex_output_; + int64_t signal_sizes_[max_rank]; + bool onesided_; +}; + +// NB: This can't be a constructor, because then CuFFTParams +// would not be a POD anymore. +static inline void setCuFFTParams(CuFFTParams* params, + const Tensor& input, int64_t signal_ndim, bool complex_input, + bool complex_output, IntList checked_signal_sizes, bool onesided) { + + memset(params, 0, sizeof(CuFFTParams)); + params->scalar_type_ = input.type().scalarType(); + for (int i = 0; i != input.dim(); ++i) { + params->input_sizes_[i] = input.size(i); + if (input.size(i) != 1) { + params->input_strides_[i] = input.stride(i); + } + } + params->signal_ndim_ = (uint8_t) signal_ndim; + params->complex_input_ = complex_input; + params->complex_output_ = complex_output; + for (size_t i = 0; i != checked_signal_sizes.size(); ++i) { + params->signal_sizes_[i] = checked_signal_sizes[i]; + } + params->onesided_ = onesided; +} + +struct CuFFTHandleDeleter { + void operator()(cufftHandle* x) { + if (x != nullptr) { + CUFFT_CHECK(cufftDestroy(*x)); + } + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// Its constructor also guarantees that if `input` is contiguous in all +// dimensions, e.g., from cloning, clone_input will be false. +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(Tensor& input, int64_t signal_ndim, bool complex_input, + bool complex_output, IntList checked_signal_sizes, bool onesided, + IntList output_sizes) { + + // signal sizes + std::vector signal_sizes(checked_signal_sizes.begin(), + checked_signal_sizes.end()); + + // input batch size + long long int batch = input.size(0); + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + clone_input = false; + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. 
+ if (input.type().scalarType() == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::globalContext().getCurrentDeviceProperties(); + if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { + std::ostringstream ss; + ss << "cuFFT doesn't support signals of half type with compute " + << "capability less than SM_53, but the device containing input half " + << "tensor only has SM_" << dev_prop->major << dev_prop->minor; + throw std::runtime_error(ss.str()); + } + for (int64_t i = 0; i < signal_ndim; i++) { + auto signal_size = checked_signal_sizes[i]; + if (!is_pow_of_two(signal_size)) { + std::ostringstream ss; + ss << "cuFFT doesn't support signals of half type with size at any " + << "dimension that is not a power of two, but got a signal size of " + << checked_signal_sizes; + throw std::runtime_error(ss.str()); + } + } + clone_input |= input.stride(signal_ndim) != 1; + } + + // check the input sizes and strides to see if we need to make it contiguous + // cuFFT doesn't support batch dim with stride 0 + clone_input |= input.stride(0) == 0; + + if (complex_input) { + // Real/imag dimension must be like complex type. + clone_input |= input.stride(-1) != 1; + // Strides of other dimensions needs to be aligned when viewed as of complex + // type, i.e., multiples of 2. We check the batch dim and last signal dim + // here. If the input can be viewed as having embedded strides, the other + // signal dims will also satisfy this. + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + clone_input |= (batch > 0 && input.stride(0) % 2 != 0) || + input.stride(signal_ndim) % 2 != 0; + } + + // Checks if input strides can be viewed as embedded. + // See NOTE [ cuFFT Embedded Strides ]. + // + // TODO: Figure out why windows fails to compile + // at::optional> inembed_opt = at::nullopt; + // Then move the following to a helper function. + std::vector inembed(signal_ndim); + if (!clone_input) { + auto istrides = input.strides(); + auto last_istride = istrides[signal_ndim]; + clone_input = last_istride <= 0; + for (auto i = signal_ndim - 1; !clone_input && i > 0 /* inembed[0] doesn't matteer */; i--) { + auto istride = istrides[i]; + if (istride > 0 && istride % last_istride == 0) { + inembed[i] = istride / last_istride; + last_istride = istride; + } else { + clone_input = true; + } + } + } + + // Check if we can take advantage of simple data layout. + // + // Note that this is before the actual cloning. This is intentional so we can + // check for advanced data layout with complex-to-real transform. cuFFT + // out-of-place complex-to-real transforms with advanced layout may overwrite + // input, and we need to clone the input. + // + // This just needs contiguity in cases except for twosided real-to-complex + // transform where we won't have simple data layout as output is two sided. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + bool simple_layout = !(!complex_input && complex_output && !onesided) && // not twosided R2C + (clone_input || input.is_contiguous()); // contiguous + if (!simple_layout && complex_input && !complex_output) { + clone_input = true; + simple_layout = true; + } + + // if input should be cloned but simple layout can't be used (e.g. 
twosided R2C) + if (clone_input && !simple_layout) { + auto input_size = input.sizes(); + std::copy(input_size.begin() + 1, // begin of signal dim in input + input_size.begin() + signal_ndim + 1, // end of signal dim in input + inembed.begin()); // begin of output + } + + cudaDataType itype, otype, exec_type; + if (input.type().scalarType() == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (input.type().scalarType() == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (input.type().scalarType() == ScalarType::Half) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + std::ostringstream ss; + ss << "cuFFT doesn't support tensor of type: " + << at::toString(input.type().scalarType()); + throw std::runtime_error(ss.str()); + } + + // create plan + auto raw_plan_ptr = new cufftHandle(); + CUFFT_CHECK(cufftCreate(raw_plan_ptr)); + plan_ptr.reset(raw_plan_ptr); + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores base_istride, base_ostride, idist, and odist + // by assuming base_istride = base_ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + // set idist (stride at batch dim) + // set base_istride (stride at innermost dim of signal) + long long int idist, base_istride; + if (clone_input) { + idist = at::prod_intlist(input.sizes().slice(1, signal_ndim)); + base_istride = 1; + } else if (complex_input) { + idist = input.stride(0) >> 1; + base_istride = input.stride(signal_ndim) >> 1; + } else { + idist = input.stride(0); + base_istride = input.stride(signal_ndim); + } + // Even if batch dimension is one and idist (stride(0)) doesn't matter, + // cuFFT errors if idist = 0. This is hack to make it succeed. + if (idist == 0 && batch == 1) { + idist = 1; + } + + // set odist, onembed, base_ostride + long long int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim)); + std::vector onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1); + long long int base_ostride = 1; + + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + inembed.data(), base_istride, idist, itype, + onembed.data(), base_ostride, odist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return *plan_ptr.get(); } + + bool should_clone_input() const { return clone_input; } + + int64_t workspace_size() const { return ws_size; } + +private: + std::unique_ptr plan_ptr; + bool clone_input; + int64_t ws_size; +}; + +// NB: cuFFT allocates a starting plan array of size 1024. It should grow the +// array as more plans are created. 
However, a bug in cuFFT (at least +// present in CUDA 9.1) causes the cufftSetAutoAllocation call on the +// 1024-th plan to fail with CUFFT_INVALID_PLAN. Therefore, we check that +// cache size is leq 1023. The initial plan array size is 1024 for +// CUDA 8.0 ~ 9.2 so setting this as a CUDA-version-agnostic constant should +// be fine for now. +// TODO: When CUDA 10 comes out, check if the bug is fixed or if we need another +// number for CUDA 10. +constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_MAX_PLAN_NUM) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache using value_args and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. + // This is similar to c++ 17 try_emplace. + template + const CuFFTConfig &try_emplace_value(K&& key, VArgs&&... value_args) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(key); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(key), + std::forward_as_tuple(value_args...)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + +private: + // Only sets size and does value check. Does not resize the data structures. 
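+  // Hypothetical caller-side sketch (names assumed, not part of this header):
+  // the thread-safety contract above means lookups are guarded externally,
+  // roughly
+  //
+  //   static std::mutex plan_mutex;              // owned by the caller
+  //   static CuFFTParamsLRUCache plan_cache;
+  //
+  //   std::lock_guard<std::mutex> guard(plan_mutex);
+  //   const CuFFTConfig& cfg = plan_cache.try_emplace_value(
+  //       std::move(params), input, signal_ndim, complex_input,
+  //       complex_output, checked_signal_sizes, onesided, output_sizes);
+  //
+  // The helper below only validates and records the requested size.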
+ void _set_max_size(int64_t new_size) { + AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + AT_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(); +void cufft_set_plan_cache_max_size_impl(int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(); +void cufft_clear_plan_cache_impl(); + +}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h new file mode 100644 index 0000000..5edfcbc --- /dev/null +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -0,0 +1,72 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Config.h" + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// This means that max dim is 3 + 2 = 5 with batch dimension and possible +// complex dimension +constexpr int max_rank = 3; + +static inline std::string _cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + default: + std::ostringstream ss; + ss << "unknown error " << error; + return ss.str(); + } +} + +static inline void CUFFT_CHECK(cufftResult error) +{ + if (error != CUFFT_SUCCESS) { + std::ostringstream ss; + ss << "cuFFT error: " << _cudaGetErrorEnum(error); + AT_ERROR(ss.str()); + } +} + +}} // at::native diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu new file mode 100644 index 0000000..4b346ca --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -0,0 +1,122 @@ +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include +#include +#include + +#include "ATen/native/Distributions.h" + +#include +#include +#include +#include + +#include +#include +#include + +THCGenerator* 
THCRandom_getGenerator(THCState* state); + +namespace { +std::pair next_philox_seed(at::Generator* gen, uint64_t increment) { + auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); + uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment); + return std::make_pair(gen_->state.initial_seed, offset); +} + +template +void poisson_cuda_kernel( + at::Tensor& ret, + const at::Tensor& lambda, + std::pair seeds) { + at::cuda::CUDA_tensor_apply2( + ret, + lambda, + [seeds] __device__( + scalar_t & ret_val, const scalar_t& lambda) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + ret_val = static_cast(curand_poisson(&state, lambda)); + }); +} + +template +void gamma_cuda_kernel( + at::Tensor& ret, + const at::Tensor& alpha, + std::pair seeds) { + using accscalar_t = at::acc_type; + at::cuda::CUDA_tensor_apply2( + ret, + alpha, + [seeds] __device__( + scalar_t & ret_val, const scalar_t& alpha) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + BaseSampler standard_uniform([&state] __device__ () { + return curand_uniform(&state); + }); + BaseSampler standard_normal([&state] __device__ () { + return curand_normal(&state); + }); + auto sample = sample_gamma(alpha, standard_uniform, standard_normal); + auto min_value = std::numeric_limits::lowest(); + ret_val = (min_value > sample) ? min_value : sample; + }); +} + +template +void gamma_grad_cuda_kernel( + at::Tensor& ret, + const at::Tensor& self, + const at::Tensor& output) { + using accscalar_t = at::acc_type; + at::cuda::CUDA_tensor_apply3( + ret, self, output, + [] __device__ (scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { + ret_val = standard_gamma_grad_one(self_val, output_val); + }); +} + +} // namespace + +namespace at { namespace native { +Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { + Tensor ret = lambda.type().tensor(lambda.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "poisson", [&] { + poisson_cuda_kernel(ret, lambda, next_philox_seed(gen, 20)); + }); + return ret; +} + +Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { + Tensor ret = alpha.type().tensor(alpha.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "gamma", [&] { + gamma_cuda_kernel(ret, alpha, next_philox_seed(gen, 10)); + }); + return ret; +} + +Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "_standard_gamma_grad", [&] { + gamma_grad_cuda_kernel(ret, self, output); + }); + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu new file mode 100644 index 0000000..affe20d --- /dev/null +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -0,0 +1,371 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/Error.h" + +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include + +#include +#include + + +namespace at { namespace native { + +namespace { + +static const int WARP_SIZE = 32; +static const int BLOCKDIMY = 32; + +template + +__global__ void embedding_backward_feature_kernel + (int64_t* indices, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ grad_weight, + int n, // OK to pass as int, we don't expect 2 billion+ samples in one shot + int64_t 
stride, + int padding_idx) +{ + extern __shared__ char buf[]; + accscalar_t* smem = (accscalar_t*)buf; + accscalar_t* my_s = smem + WARP_SIZE*threadIdx.y; + int* indices_batch = (int*)(buf + sizeof(accscalar_t)*WARP_SIZE*blockDim.y); + + const int s = (int)stride; // OK to make int, we don't expect 2 billion+ embedding row size + + const int f = threadIdx.x + blockIdx.x*blockDim.x; // feature_dim + + for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y) + { + // Entire block cooperates to load a batch of 1024 indices to process + int tid = threadIdx.x + threadIdx.y*blockDim.x; + if(batch_start + tid < n) + indices_batch[tid] = (int)indices[batch_start + tid]; + + // Loop over the batch of <= 1024 loaded indices in chunks of blockDim.y = 32 + for(int chunk_start = batch_start; chunk_start < n; chunk_start += blockDim.y) + { + // This does double duty: it makes sure indices_batch is ready, and it makes sure match-group + // leaders are done with their accumulates before other warps start loading again. + __syncthreads(); + + int n_this_chunk = (n - chunk_start) < blockDim.y ? (n - chunk_start) : blockDim.y; + + int src_row = chunk_start + threadIdx.y; + int dst_row = indices_batch[src_row - batch_start]; // This warp's target row in grad_weight + + // All warps load their smem segments with incoming grad data + if(src_row < n && f < s && dst_row != padding_idx) + my_s[threadIdx.x] = static_cast(grad[src_row*stride + f]); + + __syncthreads(); + + // To ensure determinism, we can't just have each warp add its grad data to its dst_row. + // We need to check if any other warps pulled grad data targeting dst_row. + // If so, we elect the first warp in each matching group as the leader. + // Each leader warp serializes the accumulates targeting dst_row in shared memory, + // then finishes by adding the accumulated buffer to dst_row in grad_weight. + if(dst_row != padding_idx && src_row < n) // Per-warp exit condition, safe with ballot_sync + { + int match_found_this_thread = + (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]); + if(threadIdx.x >= n_this_chunk) + match_found_this_thread = 0; + unsigned int matchmask = WARP_BALLOT(match_found_this_thread); + + int first_remaining_peer = __ffs(matchmask) - 1; + + if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader + { + matchmask ^= (1 << first_remaining_peer); + while(matchmask) + { + first_remaining_peer = __ffs(matchmask) - 1; + my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer]; + matchmask ^= (1 << first_remaining_peer); + } + if(f < s) + grad_weight[dst_row*stride + f] += static_cast(my_s[threadIdx.x]); + } + } + } + } +} + + +template +__global__ void embedding_backward_kernel( + int64_t* input, int64_t* indices, scalar_t* grad_output, scalar_t* grad_weight, + int64_t* count, int64_t numel, int64_t stride, int padding_idx) { + + using accscalar_t = acc_type; + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. 
+ // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1]) + && input[idx] != padding_idx) { + do { + const int start_feature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weight_row = ((int) input[idx]) * stride; + const int grad_row = ((int) indices[idx]) * stride; + const accscalar_t scale = count ? (accscalar_t)1.0 / count[idx] : 1.0; + + accscalar_t gradient[SZ]; + accscalar_t weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int feature_dim = start_feature + ii * WARP_SIZE; + if (feature_dim < stride) { + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int feature_dim = start_feature + ii * WARP_SIZE; + if (feature_dim < stride) { + grad_weight[weight_row + feature_dim] = static_cast(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + +/* Calculate norms of the rows of weight_ptr given by idx_ptr and capture them in norms */ +template +__global__ void renorm_kernel( + scalar_t* weights, int64_t* indices, accscalar_t max_norm, + accscalar_t norm_type, int64_t dim, + int64_t weights_stride0, int64_t weights_stride1) { + + // Some casting hacks since dynamic shared memory and templates don't work together: + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + + int tid = threadIdx.x; + int base_index = indices[blockIdx.x] * weights_stride0; + + accscalar_t v = 0; + for (int i = tid; i < dim; i += blockDim.x) { + auto x = static_cast(weights[base_index + i * weights_stride1]); + if (norm_type == 1) { + v += std::abs(x); + } else if (norm_type == 2) { + v += x * x; + } else { + v += std::pow(x, norm_type); + } + } + + using Op = ReduceAdd; + v = reduceBlock(sdata, blockDim.x, v, Op(), 0); + + if (tid == 0) { + sdata[0] = std::pow(v, static_cast(1.0 / norm_type)); + } + __syncthreads(); + + // now we renormalize the blocks that need it + if (sdata[0] > max_norm) { + auto factor = static_cast(max_norm / (sdata[0] + 1e-7)); + for (int i = tid; i < dim; i += blockDim.x) { + weights[base_index + i * weights_stride1] *= factor; + } + } +} + +} // anonymous namespace + +Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices, + int64_t num_weights, int64_t padding_idx, + bool scale_grad_by_freq) { + auto grad_arg = TensorArg(grad_, "grad", 1); + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_backward", indices_arg, kLong); + checkSameGPU("embedding_backward", grad_arg, indices_arg); + + auto num_indices = indices.numel(); + auto grad = grad_.contiguous().view({num_indices, grad_.size(-1)}); + auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); + + int64_t stride = grad_weight.stride(0); + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + if (num_indices <= 768 && !scale_grad_by_freq) { + auto indices_contig = indices.contiguous(); + + dim3 grid(THCCeilDiv(stride, (int64_t)WARP_SIZE)); + dim3 block(WARP_SIZE, BLOCKDIMY); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (grad.type(), + "embedding_backward", + [&] + { + using accscalar_t = acc_type; + embedding_backward_feature_kernel + 
<<>> + (indices_contig.data(), + grad.data(), + grad_weight.data(), + num_indices, + stride, + padding_idx); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; + } + + auto sorted_indices = at::empty_like(indices); + auto orig_indices = at::empty_like(indices); + using device_ptr = thrust::device_ptr; + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + sorted_indices.copy_(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Fill sortedOrigIndices with sequential indices + auto count_iter = thrust::counting_iterator(0); + auto orig_data = device_ptr(orig_indices.data()); + thrust::copy(policy, count_iter, count_iter + num_indices, orig_data); + + // Sort; a stable sort is not required + auto sorted_data = device_ptr(sorted_indices.data()); + thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, + ThrustLTOp()); + } + + Tensor count; + if (scale_grad_by_freq) { + count = at::empty_like(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data()); + auto count_data = device_ptr(count.data()); + thrust::inclusive_scan_by_key( + policy, + sorted_data, + sorted_data + num_indices, + thrust::make_constant_iterator(1), + count_data + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, + thrust::make_reverse_iterator(sorted_data + num_indices), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(num_indices, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "embedding_backward", [&] { + embedding_backward_kernel<<>>( + sorted_indices.data(), + orig_indices.data(), + grad.data(), + grad_weight.data(), + count.defined() ? count.data() : nullptr, + num_indices, + stride, + padding_idx); + }); + THCudaCheck(cudaGetLastError()); + + return grad_weight; +} + +Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, + double max_norm, double norm_type) { + auto self_arg = TensorArg(self, "self", 1); + auto indices_arg = TensorArg(indices, "indices", 1); + checkDim("embedding_renorm_", self_arg, 2); + checkSameGPU("embedding_renorm", self_arg, indices_arg); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + using device_ptr = thrust::device_ptr; + + auto num_indices = indices.numel(); + auto indices_contig = indices.contiguous(); + auto indices_data = device_ptr(indices_contig.data()); + + // FIXME: thrust::unique only removes consecutive elements that are equal. 
+ // We have race conditions when indices contain duplicates which are not + // adjacent + auto unique_indices = indices.type().tensor(indices.numel()); + auto unique_data = device_ptr(unique_indices.data()); + auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); + auto num_unique_indices = static_cast(end - unique_data); + + dim3 grid(num_unique_indices); + dim3 block(128); + int dim = self.stride(0); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "embedding_backward", [&] { + using accscalar_t = acc_type; + renorm_kernel<<>>( + self.data(), + unique_indices.data(), + static_cast(max_norm), + static_cast(norm_type), + dim, self.stride(0), self.stride(1)); + }); + THCudaCheck(cudaGetLastError()); + + return self; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu new file mode 100644 index 0000000..9169cb0 --- /dev/null +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -0,0 +1,389 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include "ATen/AccumulateType.h" + +#include +#include +#include +#include +#include + +#include +#include + +const int WARP_SIZE = 32; +const int MODE_SUM = 0; +const int MODE_MEAN = 1; +const int MODE_MAX = 2; + +namespace at { +namespace native { + +namespace { + +// This kernel assumes that all input tensors except `weight` are contiguous. +template +__global__ void EmbeddingBag_updateOutputKernel( + int64_t *input, int64_t *offsets, scalar_t *weight, scalar_t *output, + int64_t *offset2bag, int64_t numIndices, int64_t numBags, + int64_t featureSize, int64_t weight_stide0, int64_t weight_stride1, + int mode, int64_t *bag_size, int64_t *max_indices) { + + // the strategy here is that each bag x feature is handled by a single thread + + using accscalar_t = acc_type; + int64_t chunksPerBag = THCCeilDiv(featureSize, (int64_t)blockDim.x); + int64_t numChunks = numBags * chunksPerBag; + int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < featureSize) { + int64_t bag = chunk / chunksPerBag; + scalar_t *weightFeat = weight + featureDim * weight_stride1; + int64_t begin = offsets[bag]; + int64_t end = (bag < numBags - 1) ? 
(offsets[bag + 1]) : numIndices; + assert(end >= begin); + + accscalar_t weightFeatSum = 0; + scalar_t weightFeatMax; + + int64_t bag_size_ = 0; + int64_t maxWord = -1; + for (int64_t emb = begin; emb < end; emb++) { + const int64_t weightRow = input[emb] * weight_stide0; + scalar_t weightValue = weightFeat[weightRow]; + + if (mode == MODE_MAX) { + if (emb == begin || weightValue > weightFeatMax) { + weightFeatMax = weightValue; + maxWord = input[emb]; + } + } else { + weightFeatSum += static_cast(weightValue); + } + + bag_size_++; + if (featureDim == 0) { + offset2bag[emb] = bag; + } + } + if (mode == MODE_MEAN) { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } + + if (mode == MODE_MEAN || mode == MODE_SUM) { + output[bag * featureSize + featureDim] = static_cast(weightFeatSum); + } + else if (mode == MODE_MAX) { + max_indices[bag * featureSize + featureDim] = maxWord; + output[bag * featureSize + featureDim] = weightFeatMax; + } + } + } +} + +// FIXME: removed the accGradParametersKernelByFeature case present in +// LookupTable. That kernel is faster at small sizes (<768 indices), which +// does not need EmbeddingBag (LookupTable + Sum works fine), but would +// still be nice to not be slow in that case. + +// This kernel assumes that all input tensors are contiguous. +template +__global__ void EmbeddingBag_accGradParametersKernel_sum_avg( + int64_t *input, int64_t *indices, scalar_t *gradOutput, + scalar_t *gradWeight, int64_t *offset2bag, int64_t *count, ptrdiff_t numel, + int64_t stride, int mode, int64_t *bag_size) { + + using accscalar_t = acc_type; + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel && (idx == 0 || input[idx] != input[idx - 1])) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int)input[idx]) * stride; + + // Note: only this line changes from LookupTable_accgradParametersKernel + const int origRow = ((int)indices[idx]); + const int seq_number = offset2bag[origRow]; + const int gradOutputRow = ((int)seq_number) * stride; + + const accscalar_t scale = count ? 
(accscalar_t)1.0 / count[idx] : 1.0; + + accscalar_t gradient[SZ]; + accscalar_t weight[SZ]; + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) { + gradient[ii] = + static_cast(gradOutput[gradOutputRow + featureDim]); + if (mode == MODE_MEAN) { + gradient[ii] /= bag_size[seq_number]; + } + weight[ii] = + static_cast(gradWeight[weightRow + featureDim]); + } + } + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + weight[ii] += gradient[ii] * scale; + } + +#pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) { + gradWeight[weightRow + featureDim] = + static_cast(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + + +Tensor embedding_bag_backward_cuda_sum_avg( + const Tensor &grad, + const Tensor &indices, + const Tensor &offset2bag, + const Tensor &bag_size_, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + + Tensor &bag_size = const_cast(bag_size_); + + auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + ptrdiff_t numel = indices.numel(); + int64_t stride = grad_weight.stride(0); + + auto sorted_indices = indices.type().tensor(indices.sizes()); + auto orig_indices = indices.type().tensor(indices.sizes()); + using device_ptr = thrust::device_ptr; + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + sorted_indices.copy_(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Fill sortedOrigIndices with sequential indices + auto count_iter = thrust::counting_iterator(0); + auto orig_data = device_ptr(orig_indices.data()); + thrust::copy(policy, count_iter, count_iter + numel, orig_data); + + // Sort; a stable sort is not required + auto sorted_data = device_ptr(sorted_indices.data()); + thrust::sort_by_key(policy, sorted_data, sorted_data + numel, orig_data, + ThrustLTOp()); + } + + Tensor count; + if (scale_grad_by_freq) { + count = indices.type().tensor(indices.sizes()); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data()); + auto count_data = device_ptr(count.data()); + thrust::inclusive_scan_by_key(policy, sorted_data, sorted_data + numel, + thrust::make_constant_iterator(1), + count_data); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, thrust::make_reverse_iterator(sorted_data + numel), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + numel), + thrust::make_reverse_iterator(count_data + numel), + thrust::equal_to(), thrust::maximum()); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t)4), THCCeilDiv(stride, (int64_t)128)); + dim3 block(32, 4); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.type(), "embedding_bag_backward_cuda_sum_avg_kernel", [&] { + EmbeddingBag_accGradParametersKernel_sum_avg< + scalar_t><<>>( + sorted_indices.data(), orig_indices.data(), + 
grad.data(), grad_weight.data(), + offset2bag.data(), + count.defined() ? count.data() : nullptr, numel, stride, + mode, bag_size.data()); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; +} + +template +__global__ void EmbeddingBag_accGradParametersKernel_max( + int64_t *max_indices, scalar_t *gradOutput, + scalar_t *gradWeight, int64_t stride, int64_t numBags) { + + using accscalar_t = acc_type; + + int64_t chunksPerBag = THCCeilDiv(stride, (int64_t)blockDim.x); + int64_t numChunks = numBags * chunksPerBag; + int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < stride) { + int64_t bag = chunk / chunksPerBag; + + int64_t word_idx = max_indices[bag * stride + featureDim]; + + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } + } +} + +Tensor embedding_bag_backward_cuda_max(const Tensor &grad, + const Tensor &max_indices, + int64_t num_weights) { + + auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); + + int64_t stride = grad_weight.stride(0); + + int64_t numBags = grad.size(0); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + dim3 block = dim3(32, 8); + int grid = 1024; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.type(), "embedding_bag_backward_cuda_max", [&] { + EmbeddingBag_accGradParametersKernel_max< + scalar_t><<>>( + max_indices.data(), grad.data(), + grad_weight.data(), stride, numBags); + }); + + THCudaCheck(cudaGetLastError()); + return grad_weight; +} +} + +// Assumes all input tensors are contiguous. +// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details +std::tuple +_embedding_bag_cuda(const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const bool scale_grad_by_freq, + const int64_t mode, bool sparse) { + auto indices_arg = TensorArg(indices, "indices", 1); + checkScalarType("embedding_bag_cuda", indices_arg, kLong); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + checkScalarType("embedding_bag_cuda", offsets_arg, kLong); + auto weight_arg = TensorArg(weight, "weight", 1); + checkSameGPU("embedding_bag_cuda", weight_arg, indices_arg); + checkSameGPU("embedding_bag_cuda", weight_arg, offsets_arg); + + int64_t numIndices = indices.size(0); + int64_t numBags = offsets.size(0); + int64_t featureSize = weight.size(1); + + auto bag_size = at::zeros(offsets.sizes(), indices.options()); + auto offset2bag = + at::zeros({indices.size(0)}, indices.options()); // offset2bag = [0 0 0 0 0] + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + + auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); + + Tensor max_indices; + + if (mode == MODE_MAX) { + max_indices = at::zeros({offsets.size(0), weight.size(1)}, indices.options()); + } else { + // No need to allocate if we aren't doing a backwards pass + max_indices = at::zeros({0}, indices.options()); + } + + dim3 block = dim3(32, 8); + int grid = 1024; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(weight.type(), "embedding_bag_cuda", [&] { + EmbeddingBag_updateOutputKernel<<>>( + indices.data(), offsets.data(), + weight.data(), output.data(), + offset2bag.data(), numIndices, numBags, featureSize, + weight.stride(0), weight.stride(1), mode, bag_size.data(), + mode == MODE_MAX ? 
max_indices.data() : NULL); + }); + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, offset2bag, bag_size, max_indices); +} + +Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &indices, + const Tensor &offsets, + const Tensor &offset2bag, + const Tensor &bag_size_, + const Tensor &max_indices, + int64_t num_weights, + bool scale_grad_by_freq, int64_t mode) { + // indices, offsets and offset2bag are assumed having correct dtypes and + // contiguous here due to the checks in _embedding_bag_backward in + // EmbeddingBag.cpp. + // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml + // for more details. + + Tensor grad = grad_.contiguous(); + auto indices_arg = TensorArg(indices, "indices", 1); + auto offsets_arg = TensorArg(offsets, "offsets", 1); + auto grad_arg = TensorArg(grad, "grad", 1); + checkSameGPU("embedding_bag_cuda", grad_arg, offsets_arg); + checkSameGPU("embedding_bag_cuda", grad_arg, indices_arg); + + switch (mode) { + case MODE_SUM: + case MODE_MEAN: + return embedding_bag_backward_cuda_sum_avg(grad, indices, offset2bag, bag_size_, num_weights, scale_grad_by_freq, mode); + + case MODE_MAX: + return embedding_bag_backward_cuda_max(grad, max_indices, num_weights); + + default: + AT_ERROR( + "Unknown mode for embedding_bag_backward_cuda %d", mode); + } +} + +} +} diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu new file mode 100644 index 0000000..c31d557 --- /dev/null +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -0,0 +1,142 @@ +#include "ATen/Context.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/cuda/PinnedMemoryAllocator.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/native/Gesv.h" + +#include "THC.h" // for USE_MAGMA + +#ifdef USE_MAGMA +#include +#include +#endif + +namespace at { +namespace native { + +#ifdef USE_MAGMA +template +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, scalar_t** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, scalar_t** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + AT_ERROR("gesv only takes float or double Tensors"); +} + +template<> +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, float** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, float** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + magma_sgesv_batched( + n, nrhs, dA_array, ldda, dipiv_array, + dB_array, lddb, dinfo_array, batch_count, queue); +} + +template<> +void magmaGesvBatched( + magma_int_t n, magma_int_t nrhs, double** dA_array, magma_int_t ldda, + magma_int_t** dipiv_array, double** dB_array, magma_int_t lddb, + magma_int_t* dinfo_array, magma_int_t batch_count, magma_queue_t queue) { + magma_dgesv_batched( + n, nrhs, dA_array, ldda, dipiv_array, + dB_array, lddb, dinfo_array, batch_count, queue); +} + +static magma_queue_t createMagmaQueue(const Tensor& tensor) { + auto& context = tensor.type().get_context(); + magma_queue_t magma_queue; + magma_queue_create_from_cuda( + tensor.get_device(), + context.getCurrentCUDAStream(), + THCState_getCurrentBlasHandle(context.getTHCState()), + THCState_getCurrentSparseHandle(context.getTHCState()), + &magma_queue); + return magma_queue; +} + +static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + if (static_cast(result) != value) { + 
AT_ERROR("magma: The value of %s (%lld) is too large to fit into a magma_int_t (%llu bytes)", + varname, (long long)value, sizeof(magma_int_t)); + } + return result; +} +#endif + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline std::unique_ptr pin_memory(int64_t size, Tensor dummy) { + int64_t adjusted_size = size * sizeof(T); + auto* allocator = cuda::getPinnedMemoryAllocator(); + auto& backend = dummy.type().toBackend(kCPU).toScalarType(kByte); + return backend.storageWithAllocator(adjusted_size, allocator); +} + +#define ALLOCATE_ARRAY(name, type, size, dummy_tensor) \ + auto storage_##name = pin_memory(size, dummy_tensor); \ + name = reinterpret_cast(storage_##name->data()); + +template +static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +#ifndef USE_MAGMA +AT_ERROR("gesv: MAGMA library not found in " + "compilation. Please rebuild with MAGMA."); +#else + auto A_data = A.data(); + auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + + magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); + magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); + magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); + + magma_int_t* info_array; + magma_int_t* ipiv_data; + magma_int_t** ipiv_array; + scalar_t** A_array; + scalar_t** b_array; + + ALLOCATE_ARRAY(info_array, magma_int_t, batch_size, b); + ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n, b); + ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size, b); + ALLOCATE_ARRAY(A_array, scalar_t*, batch_size, b); + ALLOCATE_ARRAY(b_array, scalar_t*, batch_size, b); + + // Set up the created arrays + for (int64_t i = 0; i < batch_size; i++) { + A_array[i] = &A_data[i * A_mat_stride]; + b_array[i] = &b_data[i * b_mat_stride]; + ipiv_array[i] = &ipiv_data[i * n]; + } + + magmaGesvBatched( + n, nrhs, A_array, n, ipiv_array, b_array, n, + info_array, batch_size, createMagmaQueue(b)); + + for (int64_t i = 0; i < batch_size; i++) { + infos[i] = info_array[i]; + } +#endif +} + +std::tuple _gesv_helper_cuda(const Tensor& self, const Tensor& A) { + std::vector infos(batchCount(A), 0); + auto A_working_copy = cloneBatchedColumnMajor(A); + auto b_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{ + applyGesv(b_working_copy, A_working_copy, infos); + }); + checkErrors(infos); + return std::tuple(b_working_copy, A_working_copy); +} + +}} // namespace at::native + +#undef ALLOCATE_ARRAY diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu new file mode 100644 index 0000000..63f1f26 --- /dev/null +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -0,0 +1,214 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Error.h" + +#include +#include + +namespace at { +namespace native { + +__host__ __device__ __forceinline__ float fmin(float a, float b) { + return a > b ? b : a; +} + +__host__ __device__ __forceinline__ float fmax(float a, float b) { + return a > b ? 
a : b; +} + +template +__global__ void RoiPooling2d_forward_kernel( + const int outputElements, + const T *input, + const T *rois, + const T spatialScale, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int pooledHeight, + const int pooledWidth, + T *output, + int *argmaxes) +{ + for (int linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < outputElements; + linearIndex += blockDim.x * gridDim.x) + { + // Calculate position in output Tensor, i.e. a specific combination + // of proposal, channel, pool height and pool width + // TODO: write to improve performance by minimize computation + int pw = linearIndex % pooledWidth; + int ph = (linearIndex / pooledWidth) % pooledHeight; + int ch = (linearIndex / pooledWidth / pooledHeight) % inputChannels; + int proposal = linearIndex / pooledWidth / pooledHeight / inputChannels; + + // Get particular proposal data + const T *roisOffset = rois + (proposal * 5); + int n = roisOffset[0]; + int startWidth = llrintf(roisOffset[1] * spatialScale); + int startHeight = llrintf(roisOffset[2] * spatialScale); + int endWidth = llrintf(roisOffset[3] * spatialScale); + int endHeight = llrintf(roisOffset[4] * spatialScale); + + // TODO: fix malformed RoIs to be 1x1 + + int roiHeight = endHeight - startHeight; + int roiWidth = endWidth - startWidth; + + // Calculate size of tile based on the size of this particular RoI and the + // output size + T tileHeight = static_cast(roiHeight) / static_cast(pooledHeight); + T tileWidth = static_cast(roiWidth) / static_cast(pooledWidth); + + // Calculate offset into the pooled region + int tileHStart = static_cast(floorf(static_cast(ph) * tileHeight)); + int tileWStart = static_cast(floorf(static_cast(pw) * tileWidth)); + int tileHEnd = static_cast(ceilf(static_cast(ph + 1) * tileHeight)); + int tileWEnd = static_cast(ceilf(static_cast(pw + 1) * tileWidth)); + + // Calculate offset into the image itself, based on RoI + pooled offsets, + // and ensure it falls within image boundaries + tileHStart = fmin(fmax(tileHStart + startHeight, 0), inputHeight); + tileWStart = fmin(fmax(tileWStart + startWidth, 0), inputWidth); + tileHEnd = fmin(fmax(tileHEnd + startHeight, 0), inputHeight); + tileWEnd = fmin(fmax(tileWEnd + startWidth, 0), inputWidth); + + // If our pooling region is empty, we set the output to 0, otherwise to + // the min float so we can calculate the max properly + bool isEmpty = (tileHStart >= tileHEnd) || (tileWStart >= tileWEnd); + T max = isEmpty ? 0 : FLT_MIN; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxIdx = -1; + + const T *inputOffset = input + ((n * inputChannels + ch) * inputHeight * inputWidth); + for (int th = tileHStart; th < tileHEnd; ++th) { + for (int tw = tileWStart; tw < tileWEnd; ++tw) { + int index = (th * inputWidth) + tw; + if (inputOffset[index] > max) { + max = inputOffset[index]; + maxIdx = index; + } + } + } + output[linearIndex] = max; + + // TODO optional argmax + argmaxes[linearIndex] = maxIdx; + } +} + +std::tuple RoiPooling2d_forward_cuda( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale) +{ + + // Input is the output of the last convolutional layer in the Backbone network, so + // it should be in the format of NCHW + AT_CHECK(input.ndimension() == 4, "Input to RoI Pooling should be a NCHW Tensor"); + + // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first + // dim is the # of proposals, and the second dim is the proposal itself in the form + // [batch_index startW startH endW endH] + AT_CHECK(rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); + AT_CHECK(rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); + + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) + auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + // TODO: need some mechanism for determining train vs. test + + // During training, we need to store the argmaxes for the pooling operation, so + // the argmaxes Tensor should be the same size as the output Tensor + auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + + AT_CHECK(input.is_contiguous(), "input must be contiguous"); + AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); + + dim3 block(512); + dim3 grid((output.numel() + 512 - 1) / 512); + RoiPooling2d_forward_kernel<<>>( + output.numel(), input.data(), rois.data(), static_cast(spatialScale), inputChannels, + inputHeight, inputWidth, pooledHeight, pooledWidth, output.data(), argmaxes.data()); + AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_forward_kernel failed with error code ", cudaGetLastError()); + + return std::make_tuple(output, argmaxes); +} + +template +__global__ void RoiPooling2d_backward_kernel( + const int outputElements, + const T *gradOutput, + const int *argmaxes, + const int proposals, + const T spatialScale, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int pooledHeight, + const int pooledWidth, + T *gradInput, + const T *rois) +{ + for (int linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < outputElements; + linearIndex += blockDim.x * gridDim.x) + { + int pw = linearIndex % pooledWidth; + int ph = (linearIndex / pooledWidth) / pooledHeight; + int ch = (linearIndex / pooledWidth / pooledHeight) % inputChannels; + int proposal = linearIndex / pooledWidth / pooledHeight / inputChannels; + + const T *roisOffset = rois + (proposal * 5); + int n = roisOffset[0]; + int gradInputOffset = (n * inputChannels + ch) * inputHeight * inputWidth; + int gradOutputOffset = (n * inputChannels + ch) * pooledHeight * pooledWidth; + const T* gradOutputShifted = gradOutput + gradOutputOffset; + T *gradInputShifted = gradInput + gradInputOffset; + const int *argmaxesShifted = argmaxes + gradOutputOffset; + + int argmax = argmaxesShifted[ph * pooledWidth + pw]; + if (argmax != -1) { + atomicAdd(gradInputShifted + argmax, gradOutputShifted[ph * pooledWidth + pw]); + } + } +} + +Tensor RoiPooling2d_backward_cuda( + const Tensor& input, + const Tensor& rois, + int64_t pooledHeight, + int64_t pooledWidth, + double spatialScale, + const Tensor& gradOutput, + const Tensor& argmaxes) +{ + // TODO: assertions? 
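+  //
+  // A sketch of what the TODO above might check, mirroring the forward pass
+  // (illustrative only, not enabled here):
+  //
+  //   AT_CHECK(input.ndimension() == 4, "input must be a NCHW Tensor");
+  //   AT_CHECK(rois.ndimension() == 2 && rois.size(1) == 5,
+  //            "rois must be of the form [batch_index startW startH endW endH]");
+  //   AT_CHECK(gradOutput.is_contiguous() && argmaxes.is_contiguous(),
+  //            "gradOutput and argmaxes must be contiguous");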
+ + auto proposals = rois.size(0); + auto inputChannels = input.size(1); + auto inputHeight = input.size(2); + auto inputWidth = input.size(3); + + auto gradInput = input.type().tensor(input.sizes()); + + dim3 block(512); + dim3 grid((gradInput.numel() + 512 - 1) / 512); + RoiPooling2d_backward_kernel<<>>( + gradOutput.numel(), gradOutput.data(), argmaxes.data(), proposals, + static_cast(spatialScale), inputChannels, inputHeight, inputWidth, + pooledHeight, pooledWidth, gradInput.data(), rois.data()); + AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_backward_kernel failed with error code ", cudaGetLastError()); + + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu new file mode 100644 index 0000000..0ee5d18 --- /dev/null +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -0,0 +1,596 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/WrapDimUtils.h" +#include +#include +#include +#include + +#include "ATen/AccumulateType.h" + + +namespace at { +namespace native { + +namespace { + +template +struct LogSoftMaxForwardEpilogue { + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : logsum(max_input + std::log(sum)) {} + + __device__ __forceinline__ T operator()(T input) const { + return static_cast(input - logsum); +} + + const AccumT logsum; +}; + +template +struct LogSoftMaxBackwardEpilogue { + __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + __device__ __forceinline__ T operator()(T gradOutput, T output) const { + return static_cast(gradOutput - std::exp(static_cast(output)) * sum); + } + + const AccumT sum; +}; + +template +struct SoftMaxForwardEpilogue { + __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : max_input(max_input) + , sum(sum) {} + + __device__ __forceinline__ T operator()(T input) const { + return static_cast(std::exp(input - max_input) / sum); + } + + const AccumT max_input; + const AccumT sum; +}; + +template +struct SoftMaxBackwardEpilogue { + __device__ __forceinline__ SoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + // XXX: gradOutput that we get here is really gradOutput * output + // Look for cmul in SoftMax_updateGradInput + __device__ __forceinline__ T operator()(T gradOutput, T output) const { + return static_cast(gradOutput - output * sum); + } + + const AccumT sum; +}; + + + + +//////////////////////////////////////////////////////////////////////////////// +// Spatial kernel (fast with large inner_size and small dim_size) +//////////////////////////////////////////////////////////////////////////////// +// Let's assume that our input has been flattened to have only three dimension: +// outer x dim x inner +// The spatial algorithm tries to paralellize along all of them. +// Within a 2d block threadIdx.y paralellizes over dim slices, and threads that +// share it will speed up reductions over dim (along axis x). +// The 2d grid is used to paralellize inner dimension over y axis and outer over x. 
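+//
+// Worked example with hypothetical sizes outer=128, dim=16, inner=4096:
+// SpatialSoftMax_getBlockSize picks block = (dim_threads=1, inner_threads=1024),
+// since inner_threads > 64 leaves the dim-reduction threads at 1, so no shared
+// memory is needed (smem_size = 0 when block.x == 1). The grid then tiles
+// ceil(4096 / 1024) = 4 blocks along y for the inner dimension (capped by the
+// occupancy limit) and spends the remaining active blocks on outer along x.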
+inline dim3 SpatialSoftMax_getGridSize( + dim3 block, uint32_t max_active_blocks, + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) { + // First, tile as many blocks as we can over the y axis + uint32_t inner_blocks = (inner_size + block.y - 1) / block.y; + if (inner_blocks > max_active_blocks) + inner_blocks = max_active_blocks; + // Fill the x axis with as many blocks as we can fit (a little more is ok too) + uint32_t outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; + if (outer_blocks > outer_size) + outer_blocks = outer_size; + return dim3(outer_blocks, inner_blocks); +} + +const int max_threads = 1024; + +inline dim3 SpatialSoftMax_getBlockSize( + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size) { + uint32_t inner_threads = inner_size; + inner_threads = std::min(inner_threads, static_cast(max_threads)); + uint32_t dim_threads = 1; + if (inner_threads <= 64 && dim_size >= 64) { + while (inner_threads * dim_threads <= max_threads && dim_threads <= dim_size) + dim_threads *= 2; + dim_threads /= 2; + } + return dim3(dim_threads, inner_threads); +} + + +template +void SpatialSoftMax_getLaunchSizes( + Kernel k, + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size, + dim3& grid, dim3& block, uint32_t& smem_size) { + block = SpatialSoftMax_getBlockSize(outer_size, dim_size, inner_size); + uint32_t block_threads = block.x * block.y; + smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); + int max_active_blocks; +#ifdef __HIP_PLATFORM_HCC__ + max_active_blocks = 16; +#else + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + k, block_threads, smem_size); +#endif + max_active_blocks *= at::globalContext().getCurrentDeviceProperties()->multiProcessorCount; + grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); +} + +inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); + while (block_size < max_block_size) block_size *= 2; + // Launch at least a single warp - the kernel assumes that. + block_size = std::max(block_size, static_cast(32)); + return dim3(block_size); +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +// Note that it's not a complete block-wide reduction. +// Only threads that share threadIdx.y reduce values. 
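+//
+// Illustrative trace for blockDim.x = 4 and row values {3, 1, 4, 2} with the
+// Add reduction: offset 2 folds to {7, 3, _, _}, offset 1 folds to {10, _, _, _},
+// and every thread in that threadIdx.y row then reads 10 from shared[0].
+// Rows with a different threadIdx.y reduce their own values independently.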
+template class ReduceOp> +__forceinline__ __device__ +T spatialBlockReduceX(T *shared, T val) { + ReduceOp r; + shared += threadIdx.y * blockDim.x; + + __syncthreads(); + + shared[threadIdx.x] = val; + + // NOTE: loop starts with __syncthreads() + int offset = blockDim.x / 2; + while (offset > 0) { + __syncthreads(); + if (threadIdx.x < offset) + shared[threadIdx.x] = r(shared[threadIdx.x], shared[threadIdx.x + offset]); + offset /= 2; + } + + __syncthreads(); + + return shared[0]; +} + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxForward( + scalar_t *output, scalar_t *input, + uint32_t outer_size, uint32_t dim_size, uint32_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const uint32_t outer_stride = inner_size * dim_size; + const uint32_t dim_stride = inner_size; + + for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const uint32_t outer_offset = outer_index * outer_stride; + for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) { + const uint32_t data_offset = outer_offset + inner_index; + //////////////////////////////////////////////////////////// + // These two blocks are really eqivalent, but specializing on + // blockDim.x == 1 makes the kernel faster when it's unused. + // I didn't want to thread an extra template parameter, and nvcc + // seems to be smart enough to hoist the if outside of the loops. + //////////////////////////////////////////////////////////// + + if (blockDim.x > 1) { + accscalar_t max_input = THCNumerics::min(); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + max_input = spatialBlockReduceX(sdata,max_input); + + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += std::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(max_input, sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } else { + accscalar_t max_input = THCNumerics::min(); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += std::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + Epilogue epilogue(max_input, sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } + } + } +} + + + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxBackward( + scalar_t *gradInput, scalar_t *output, scalar_t *gradOutput, + uint32_t outer_size, uint32_t dim_size, uint32_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const uint32_t outer_stride = inner_size * dim_size; + const uint32_t dim_stride = inner_size; + + for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const uint32_t outer_offset = outer_index * outer_stride; + for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < 
inner_size; inner_index += blockDim.y * gridDim.y) { + const uint32_t data_offset = outer_offset + inner_index; + // See the comment in forward kernel + if (blockDim.x > 1) { + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += gradOutput[data_offset + d * dim_stride]; + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } else { + accscalar_t sum = 0; + for (uint32_t d = 0; d < dim_size; d++) + sum += gradOutput[data_offset + d * dim_stride]; + + Epilogue epilogue(sum); + for (uint32_t d = 0; d < dim_size; d++) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } + } + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Regular kernel (fast when dim_size is large; requires inner_size == 1) +//////////////////////////////////////////////////////////////////////////////// + + +template +struct MaxFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { + return ::max(max, (AccumT)v); + } +}; + +template +struct AddFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + v; + } +}; + +template +struct SumExpFloat +{ + __device__ __forceinline__ SumExpFloat(AccumT v) + : max_k(v) {} + + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + std::exp(v - max_k); + } + + const AccumT max_k; +}; + +template class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction& r, + AccumT defaultVal) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val; + + __syncthreads(); + + AccumT warpVal = defaultVal; + + // First warp will perform per-warp reductions for the remaining warps + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal = r(warpVal, smem[lane * 32 + i]); + } + smem[lane] = warpVal; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal = defaultVal; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal = r(blockVal, smem[i]); + } + smem[0] = blockVal; + } + + // Sync and broadcast + __syncthreads(); + return smem[0]; +} + +template class Reduction, int ILP, typename T, typename AccumT> +__device__ __forceinline__ AccumT +ilpReduce(T* data, + int size, + const Reduction& r, + AccumT defaultVal) +{ + AccumT threadVal = defaultVal; + int offset = threadIdx.x; + + int last = size % (ILP * blockDim.x); + + // Body (unroll by ILP times) + for (; offset < size - last; offset += blockDim.x * ILP) { + T tmp[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + tmp[j] = data[offset + j * blockDim.x]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + threadVal = r(threadVal, tmp[j]); + } + + // Epilogue + for (; offset < size; offset += blockDim.x) + threadVal = r(threadVal, data[offset]); + + return threadVal; +} + +template class Epilogue> +__global__ void +cunn_SoftMaxForward(scalar_t *output, scalar_t *input, int classes) +{ + extern __shared__ unsigned char smem[]; 
+ auto sdata = reinterpret_cast(smem); + // forward pointers to batch[blockIdx.x] + // each block handles a sample in the mini-batch + input += blockIdx.x * classes; + output += blockIdx.x * classes; + + // find the max + accscalar_t threadMax = ilpReduce( + input, classes, MaxFloat(), -THCNumerics::max()); + accscalar_t max_k = blockReduce( + sdata, threadMax, Max(), -THCNumerics::max()); + + // reduce all values + accscalar_t threadExp = ilpReduce( + input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduce( + sdata, threadExp, Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + for (; offset < classes - last; offset += blockDim.x * ILP) { + scalar_t tmp[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + tmp[j] = input[offset + j * blockDim.x]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) + output[offset + j * blockDim.x] = epilogue(tmp[j]); + } + + for (; offset < classes; offset += blockDim.x) + output[offset] = epilogue(input[offset]); +} + +template class Epilogue> +__global__ void +cunn_SoftMaxBackward(scalar_t *gradInput, scalar_t *output, scalar_t *gradOutput, int classes) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + gradInput += blockIdx.x * classes; + output += blockIdx.x * classes; + gradOutput += blockIdx.x * classes; + + accscalar_t threadSum = ilpReduce( + gradOutput, classes, AddFloat(), accscalar_t(0)); + accscalar_t sum_k = blockReduce( + sdata, threadSum, Add(), accscalar_t(0)); + + Epilogue epilogue(sum_k); + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + for (; offset < classes - last; offset += blockDim.x * ILP) { + scalar_t tmpGradOutput[ILP]; + scalar_t tmpOutput[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + tmpGradOutput[j] = gradOutput[offset + j * blockDim.x]; + tmpOutput[j] = output[offset + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < ILP; ++j) + gradInput[offset + j * blockDim.x] = epilogue(tmpGradOutput[j], tmpOutput[j]); + } + + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = epilogue(gradOutput[offset], output[offset]); +} + + + + + + +template class Epilogue> +Tensor host_softmax(const Tensor & input_, const int64_t dim_){ + auto input = input_.contiguous(); + Tensor output = at::empty_like(input); + if (input.dim() == 0) input = input.view(1); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + AT_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + int64_t inner_size = 1; + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + // This kernel spawns a block per each element in the batch. + // XXX: it assumes that inner_size == 1 + if (inner_size == 1) { + const int ILP = 2; + dim3 grid(outer_size); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] { + using accscalar_t = acc_type; + cunn_SoftMaxForward + <<>>( + output.data(), input.data(), dim_size + ); + }); + // This kernel runs in a 2D grid, where each application along y dimension has a fixed + // outer_size, and runs in parallel over inner_size. Dimension x is parallel over outer_size. 
+ // Reductions over dim are done in a single-threaded manner. + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] { + using accscalar_t = acc_type; + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data(), input.data(), outer_size, dim_size, inner_size + ); + }); + } + THCudaCheck(cudaGetLastError()); + return output; +} + +template class Epilogue> +Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t dim_){ + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + auto grad = grad_.contiguous(); + Tensor gI = at::empty_like(grad); + if (grad.dim() == 0) grad = grad.view(1); + AT_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + auto output = output_.contiguous(); + if (output.dim() == 0) output = output.view(1); + int64_t outer_size = 1; + int64_t dim_size = output.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= output.size(i); + for (int64_t i = dim + 1; i < output.dim(); ++i) + inner_size *= output.size(i); +// See descriptions of kernels above. + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + if (inner_size == 1) { + const int ILP = 2; + dim3 grid(outer_size); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + cunn_SoftMaxBackward + <<>>( + gI.data(), output.data(), grad.data(), dim_size + ); + }); + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data(), output.data(), grad.data(), + outer_size, dim_size, inner_size + ); + }); + } + THCudaCheck(cudaGetLastError()); + return gI; +} +} + +Tensor log_softmax_cuda(const Tensor &input, const int64_t dim){ + return host_softmax(input, dim); +} + +Tensor log_softmax_backward_cuda(const Tensor &grad, const Tensor &output, int64_t dim, const Tensor &input){ + return host_softmax_backward(grad, output, dim); +} + +Tensor softmax_cuda(const Tensor &input, const int64_t dim){ + return host_softmax(input, dim); +} + +Tensor softmax_backward_cuda(const Tensor &grad, const Tensor &output, int64_t dim, const Tensor &input){ + + Tensor tmp = grad * output; + return host_softmax_backward(tmp, output, dim); +} + +} +} diff --git a/aten/src/ATen/native/cuda/SparseMM.cu b/aten/src/ATen/native/cuda/SparseMM.cu new file mode 100644 index 0000000..29dfd8d --- /dev/null +++ b/aten/src/ATen/native/cuda/SparseMM.cu @@ -0,0 +1,15 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" + +namespace at { namespace native { +// sparse, sparse, sparse, dense, real, real -> sparse +Tensor& _sspaddmm_out_only_sparse_cuda(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("tensor.sspaddmm(...) 
can only be called on sparse tensors"); +} +Tensor& _sspaddmm_out_cuda(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("NYI: CUDA sspaddmm is not implemented"); +} +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu new file mode 100644 index 0000000..7266ebd --- /dev/null +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -0,0 +1,310 @@ +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/SpectralOpsUtils.h" +#include "ATen/native/cuda/CuFFTUtils.h" +#include "ATen/native/cuda/CuFFTPlanCache.h" +#include +#include + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +using namespace at::native::detail; + +// In real-to-complex transform, cuFFT only fills half of the values due to +// conjugate symmetry. See native/SpectralUtils.h for more details. +// The following structs are used to fill in the other half with symmetry in +// case of real-to-complex transform with onesided=False flag. +// See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h. + +// counting_iterator => index to fill +struct cnt_to_dst_idx_functor : public thrust::unary_function +{ + const int64_t last_dim_size; + const int64_t last_dim_start_slice; + const int64_t last_dim_to_fill_size; + + cnt_to_dst_idx_functor(int64_t last_dim_size, int64_t last_dim_start_slice) : + last_dim_size(last_dim_size), last_dim_start_slice(last_dim_start_slice), + last_dim_to_fill_size(last_dim_size - last_dim_start_slice) {} + + __host__ __device__ __forceinline__ + int64_t operator()(const int64_t& i) const + { + int64_t imag = i % 2; + int64_t idx = i / 2; + int64_t num_dim = idx / last_dim_to_fill_size; + int64_t slice_idx = idx % last_dim_to_fill_size; + return (num_dim * last_dim_size + last_dim_start_slice + slice_idx) * 2 + imag; + } +}; + +// index to fill => index to read from +template +struct dst_idx_to_src_functor : public thrust::unary_function +{ + // output can have at most dim 5 (batch + 3 signal dim + real/imag) + int64_t sizes[max_rank + 2], strides[max_rank + 2]; + const int64_t signal_ndim; + scalar_t *data; // device ptr + + dst_idx_to_src_functor(const Tensor& batched_complex_signal) + : signal_ndim(batched_complex_signal.dim() - 1), + data(batched_complex_signal.data()) { + for (int64_t i = 0; i < signal_ndim; i++) { + sizes[i] = batched_complex_signal.size(i); + strides[i] = batched_complex_signal.stride(i); + } + } + + __device__ __forceinline__ + scalar_t operator()(const int64_t& write_idx_with_imag) const + { + int64_t imag = write_idx_with_imag % 2; + // all but first (batch) and last (real/imag) dims need to be reflected + int64_t read_idx = 0; + int64_t remainder = write_idx_with_imag - imag; + int64_t dim_idx, dim_stride; + for (int64_t i = 0; i < signal_ndim; i++) { + dim_stride = strides[i]; + dim_idx = remainder / dim_stride; + if (i == 0) { + read_idx += dim_idx * dim_stride; + } else if (dim_idx != 0) { + read_idx += (sizes[i] - dim_idx) * dim_stride; + } + remainder = remainder % dim_stride; + } + if (imag) { + return -data[read_idx + 1]; + } else { + return data[read_idx]; + } + } +}; + +// input should be a contiguous batched tensor of same size as full (twosided) +// signals, but only contains half (onesided) of the values. +// This function modifies inplace. 
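+//
+// Concrete example: for a real 1-D signal of length 8, the R2C transform
+// writes only the first 8 / 2 + 1 = 5 complex bins; the remaining bins
+// k = 5, 6, 7 are filled in here as conj(X[8 - k]) -- a mirrored read index
+// with a negated imaginary part, which is what dst_idx_to_src_functor above
+// computes.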
+__forceinline__ +static void _fft_fill_with_conjugate_symmetry_(Tensor& input, + int64_t size_last_dim, int64_t last_dim_start_slice) { + if (last_dim_start_slice >= size_last_dim) { + return; + } + + // copy + int64_t n = input.numel() / size_last_dim * (size_last_dim - last_dim_start_slice); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "_fft_fill_with_conjugate_symmetry_", [&] { + typedef thrust::device_ptr device_ptr; + typedef thrust::counting_iterator counter; + typedef thrust::transform_iterator dst_idx_iterator; + typedef thrust::permutation_iterator dst_iterator; + typedef thrust::transform_iterator, dst_idx_iterator> src_iterator; + + dst_idx_iterator dst_idxs(counter(0), cnt_to_dst_idx_functor(size_last_dim, last_dim_start_slice)); + + auto data = device_ptr(input.data()); + dst_iterator dsts(data, dst_idxs); + src_iterator srcs(dst_idxs, dst_idx_to_src_functor(input)); + thrust::copy_n(policy, srcs, n, dsts); + }); +} + +// NOTE [ cuFFT Embedded Strides ] +// +// cuFFT supports a subset of arbitrary strides via their "advanced data layout" +// option (http://docs.nvidia.com/cuda/cufft/index.html#advanced-data-layout). +// Specifically, these are tensors that can be viewed as subtensors resulted +// from slicing a larger contiguous tensors. For such input tensors, let the +// sizes of the enclosing tensor be `inembed`, and we can have in 3d case: +// +// input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z)] +// +// Above is the simplified formula ignoring the batch dimension. In fact, the +// last dimension of the enclosing tensor doesn't have to be contiguous, i.e., +// it can be greater than 1. Then one can set the base stride for the enclosing +// tensor with `istride`. Then we have +// +// input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z) * istride] +// +// For example, consider +// +// enclosing = torch.zeros(6, 8, 10) # contiguous +// input = enclosing[:4, 2:6, 6:] +// input.size() # [ 4, 4, 4] +// input.stride() # [80, 10, 1] +// # inembed = [6, 8, 10] +// input[2, 1, 3] = input[((2 * 8) + 1) * 10 + 3] # using above formula +// = input[173] +// = input[2 * 80 + 1 * 10 + 1 * 3] # using strides directly +// +// Generally, the embedded strides can be computed as +// +// embed[i] = stride[i - 1] / stride[i]. +// +// Note that the value of embed[0] isn't used to compute indices and doesn't +// matter. +// +// Contrary to advanced data layout, simple layout means that *embeds have +// unit-strides. In particular, unit-stride refers to that the input and output +// tensors being contiguous, and that the strides at the innermost signal +// dimension being unit (1) w.r.t. the corresponding data type. 
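+
+// A minimal sketch of the embed computation described above (illustrative
+// only; the actual plan setup happens inside CuFFTConfig). embed[0] is left
+// at zero because cuFFT never reads it.
+inline std::vector<int64_t> cufft_embedded_sizes_sketch(IntList strides) {
+  std::vector<int64_t> embed(strides.size(), 0);
+  for (size_t i = 1; i < strides.size(); i++) {
+    embed[i] = strides[i - 1] / strides[i];
+  }
+  return embed;
+}
+// For the example above (strides {80, 10, 1}) this gives {0, 8, 10}, agreeing
+// with inembed = [6, 8, 10] in every position cuFFT actually uses.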
+ +static inline Tensor _run_cufft( + const CuFFTConfig &config, Tensor& input, int64_t signal_ndim, + bool complex_input, bool complex_output, bool inverse, + IntList checked_signal_sizes, bool normalized, bool onesided, + IntList output_sizes, bool input_was_cloned +) { + if (config.should_clone_input() && !input_was_cloned) { + input = input.clone(); + } + + auto& plan = config.plan(); + auto& ctx = at::globalContext(); + + // set output + auto output = input.type().tensor(output_sizes); + + // set to current stream + CUFFT_CHECK(cufftSetStream(plan, ctx.getCurrentCUDAStream())); + + auto ws = ctx.getType(at::Backend::CUDA, at::ScalarType::Byte).tensor({ config.workspace_size() }); + CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); + + // run + CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), + inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); + + // rescale if needed by normalized flag or inverse transform + auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1]; + if (normalized || inverse) { + auto signal_numel = at::prod_intlist(checked_signal_sizes); + double scale_denom; + if (normalized) { + scale_denom = std::sqrt(static_cast(signal_numel)); + } else { + scale_denom = static_cast(signal_numel); + } + if (!complex_input && complex_output && !onesided) { + auto end_data_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); + output.narrow(signal_ndim, 0, end_data_slice).div_(scale_denom); + } else { + output.div_(scale_denom); + } + } + + // if needed, fill out the other half using conjugate symmetry + if (!complex_input && complex_output && !onesided) { + auto start_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); + _fft_fill_with_conjugate_symmetry_(output, size_last_signal_dim, start_slice); + } + return output; +} + +// The cuFFT plan cache, defined in CuFFTUtils.h +struct CuFFTParamsLRUCache plan_cache; +std::mutex plan_cache_mutex; + +namespace detail { + +int64_t cufft_get_plan_cache_max_size_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.max_size(); +} + +void cufft_set_plan_cache_max_size_impl(int64_t max_size) { + std::lock_guard guard(plan_cache_mutex); + plan_cache.resize(max_size); +} + +int64_t cufft_get_plan_cache_size_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.size(); +} + +void cufft_clear_plan_cache_impl() { + std::lock_guard guard(plan_cache_mutex); + return plan_cache.clear(); +} + +} // namespace at::native::detail + +// cuFFT +// Currently not utilizing multi GPUs so this can be potentially sped up. +Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim, + bool complex_input, bool complex_output, bool inverse, + IntList checked_signal_sizes, bool normalized, bool onesided, + IntList output_sizes) { + Tensor input = self; + bool input_was_cloned = false; + + // Slice when twosided complex-to-real. This is not always needed because we + // calculate the inembed. But it will benefit us in certain cases where we + // clone the input tensor. + // + // See NOTE [ cuFFT Embedded Strides ]. + // See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h. + if (complex_input && !complex_output && !onesided) { + auto onesided_size = infer_ft_real_to_complex_onesided_size(checked_signal_sizes[signal_ndim - 1]); + input = input.narrow(signal_ndim, 0, onesided_size); + } + + // cuFFT requires input and output data pointers to complex type aligned. 
+ // Our allocated output tensor is always 256 bytes aligned so it is fine, but + // we need to check input tensor to make sure that it is not unaligned, e.g., + // from a slicing. + auto complex_size_bytes = 2 * input.type().elementSizeInBytes(); + if (reinterpret_cast(input.data_ptr()) % complex_size_bytes != 0) { + input = input.clone(); + input_was_cloned = true; + } + + // Now that we have done error check and data_ptr checks, we delegate all + // futher cuFFT parameter computation and plan creation to the helper class + // CuFFTConfig in CuFFTUtils.h. + + // If plan caching is enabled, we check the cache. Note that this accesses + // plan_cache.max_size() and thus makes this function less functional. + // However, integrating additional arguments into the "public" level c++ APIs, + // e.g., irfft, is difficult as we have a long call sequence looking like + // irfft --> _fft --> _fft_with_size --dispatching-to-> _fft_cufft + + // This read is not locked for perf reason. Shouldn't matter too much because + // we check again after acquiring the lock. + if (plan_cache.max_size() > 0) { + CuFFTParams params; + setCuFFTParams(¶ms, input, signal_ndim, complex_input, + complex_output, checked_signal_sizes, onesided); + std::lock_guard guard(plan_cache_mutex); + if (plan_cache.max_size() > 0) { // check again after acquiring the lock + const CuFFTConfig &config = plan_cache.try_emplace_value(std::move(params), + input, signal_ndim, complex_input, + complex_output, checked_signal_sizes, + onesided, output_sizes); + return _run_cufft(config, input, signal_ndim, complex_input, + complex_output, inverse, checked_signal_sizes, normalized, + onesided, output_sizes, input_was_cloned); + } + } + CuFFTConfig config(input, signal_ndim, complex_input, complex_output, + checked_signal_sizes, onesided, output_sizes); + return _run_cufft(config, input, signal_ndim, complex_input, + complex_output, inverse, checked_signal_sizes, normalized, + onesided, output_sizes, input_was_cloned); +} + +}} // at::native diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu new file mode 100644 index 0000000..46c812c --- /dev/null +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -0,0 +1,291 @@ +#include "ATen/ATen.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +namespace at { +namespace cuda { +#define THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM 100 +#define THRESH_NUMBER_BINS_FOR_GLOBAL_MEM 1000 +#define FOR_KERNEL_LOOP(i, lim) \ + for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < lim; \ + i += gridDim.x * blockDim.x) + +/* + Memory types used for the 3 histogram implementations. + See `CUDA_tensor_histogram` below. + */ +enum class CUDAHistogramMemoryType { SHARED, MULTI_BLOCK, GLOBAL }; + +/* + Kernel for computing the histogram of the input. 
+ */ +template < + typename output_t, + typename input_t, + typename IndexType, + int ADims, + int PDims, + int BDims, + CUDAHistogramMemoryType MemoryType = CUDAHistogramMemoryType::MULTI_BLOCK, + typename Op> +__global__ void kernelHistogram1D( + detail::TensorInfo a, /* output */ + detail::TensorInfo p, /* partial output */ + detail::TensorInfo b, /* input */ + int binsize, + IndexType totalElements, + Op getOp) { + extern __shared__ unsigned char my_smem[]; + output_t* smem = nullptr; + + if (MemoryType == CUDAHistogramMemoryType::SHARED) { + ////////////////////////// Shared memory ////////////////////////// + // atomically add to block specific shared memory + // then atomically add to the global output tensor + smem = reinterpret_cast(my_smem); + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + smem[i] = 0; + } + __syncthreads(); + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + // Use value at `b` as an offset of `smem` + const IndexType pOffset = b.data[bOffset] / binsize; + atomicAdd(&smem[pOffset], getOp(linearIndex)); + } + __syncthreads(); + // NOTE: atomically update output bin count. + // Atomic update is imp since __syncthread() will only synchronize threads + // in a given block, not across blocks. + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + const IndexType aOffset = + detail::IndexToOffset::get(i, a); + atomicAdd(&a.data[aOffset], smem[i]); + } + + } else if (MemoryType == CUDAHistogramMemoryType::MULTI_BLOCK) { + ////////////////////////// Multi Block memory ////////////////////////// + // atomically add to block specific global tensor + // then atomically add to the global output tensor + // compute histogram for the block + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + const auto bVal = b.data[bOffset]; + // Use value at `b` as an offset of `p` + const IndexType pIdx = p.strides[0] * blockIdx.x + bVal / binsize; + const IndexType pOffset = + detail::IndexToOffset::get(pIdx, p); + atomicAdd(&p.data[pOffset], getOp(linearIndex)); + } + __syncthreads(); + // NOTE: atomically update output bin count. + // Atomic update is imp since __syncthread() will only synchronize threads + // in a given block, not across blocks. 
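+    // Concretely, the partial tensor `p` has one row of nbins counters per
+    // block (it is allocated as [gridDim.x, nbins] in CUDA_tensor_histogram
+    // below), so block blockIdx.x folds its own row into the final output `a`.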
+ const IndexType pIdx = p.strides[0] * blockIdx.x; + const IndexType pOffset = + detail::IndexToOffset::get(pIdx, p); + for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) { + const IndexType aOffset = + detail::IndexToOffset::get(i, a); + atomicAdd(&a.data[aOffset], p.data[pOffset + i]); + } + + } else { + ////////////////////////// Global memory ////////////////////////// + // atomically add to the output tensor + // compute histogram for the block + FOR_KERNEL_LOOP(linearIndex, totalElements) { + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = + detail::IndexToOffset::get(linearIndex, b); + const auto bVal = b.data[bOffset]; + // Use value at `b` as an offset of `a` + const IndexType aIdx = bVal / binsize; + const IndexType aOffset = + detail::IndexToOffset::get(aIdx, a); + atomicAdd(&a.data[aOffset], getOp(linearIndex)); + } + } +} + +#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP) \ + kernelHistogram1D \ + <<>>( \ + aInfo, pInfo, bInfo, binsize, totalElements, WEIGHTS_OP); \ + AT_ASSERTM(cudaGetLastError() == cudaSuccess, "kernelHistogram1D failed"); + +#define HANDLE_SWITCH_CASE(mType, getOp) \ + switch (mType) { \ + case CUDAHistogramMemoryType::SHARED: \ + HANDLE_CASE(CUDAHistogramMemoryType::SHARED, getOp); \ + break; \ + case CUDAHistogramMemoryType::MULTI_BLOCK: \ + HANDLE_CASE(CUDAHistogramMemoryType::MULTI_BLOCK, getOp); \ + break; \ + default: \ + HANDLE_CASE(CUDAHistogramMemoryType::GLOBAL, getOp); \ + } + +inline int64_t getFreeGlobalMemory() { + // no need to use `cudaSetDevice` + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + AT_ASSERTM( + cudaGetLastError() == cudaSuccess, + "CUDA_tensor_histogram failed to get free global memory"); + return static_cast(free_mem); +} + +/* + Calculate the frequency of the input values. + + `a` contains the final output or the histogram. + Input `b` is assumed to be 1-D non-negative int array. + `c` optionally contains the weight vector. + See `help torch.bincount` for details on the math. + + 3 implementations based of input size and memory usage: + case: #bins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM and enough shared mem + SHARED: Each block atomically adds to it's own **shared** hist copy, + then atomically updates the global tensor. + case: #bins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM and enough global mem + MULTI_BLOCK: Each block atomically adds to it's own **global** hist + copy, then atomically updates the global tensor. + case: THRESH_NUMBER_BINS_FOR_GLOBAL_MEM <= #bins + GLOBAL: all threads atomically update to a single **global** hist copy. 
+ */ +template +bool CUDA_tensor_histogram( + at::Tensor a, /* output */ + at::Tensor b, /* input */ + at::Tensor c, /* weights(optional) */ + int64_t nbins, + int binsize, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { + checkBackend("CUDA_tensor_histogram", {a, b}, Backend::CUDA); + if (HasWeights) { + checkBackend("CUDA_tensor_histogram", {c}, Backend::CUDA); + } + auto totalElements = b.size(0); + + const dim3 block = getApplyBlock(); + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1 || !getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL; + auto maxSharedMem = + at::globalContext().getCurrentDeviceProperties()->sharedMemPerBlock; + auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes + auto maxGlobalMem = getFreeGlobalMemory(); + auto multiBlockMem = nbins * grid.x * sizeof(output_t) + 8; // 8 guard bytes + // determine memory type to use in the kernel + if (nbins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM && + sharedMem < maxSharedMem) { + memType = CUDAHistogramMemoryType::SHARED; + } else if ( + nbins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM && + multiBlockMem < (maxGlobalMem / 2)) { + // check against half of free mem to be extra safe + // due to cached allocator, we may anyway have slightly more free mem + memType = CUDAHistogramMemoryType::MULTI_BLOCK; + } + + // alloc memory for MULTI_BLOCK + using IndexType = int64_t; + auto aInfo = detail::getTensorInfo(a); + auto bInfo = detail::getTensorInfo(b); + detail::TensorInfo pInfo(nullptr, 0, {}, {}); + Tensor partial_output; + if (memType == CUDAHistogramMemoryType::MULTI_BLOCK) { + partial_output = native::zeros({grid.x, nbins}, a.options()); + pInfo = detail::getTensorInfo(partial_output); + } + + if (HasWeights) { + auto cInfo = detail::getTensorInfo(c); + const auto getWeightsOp = [cInfo] __device__(IndexType cIndex) { + const IndexType cOffset = + detail::IndexToOffset::get(cIndex, cInfo); + return cInfo.data[cOffset]; + }; + HANDLE_SWITCH_CASE(memType, getWeightsOp) + } else { + static const auto getDummyOp = [] __device__(IndexType) { return 1L; }; + HANDLE_SWITCH_CASE(memType, getDummyOp) + } + return true; +} + +#undef HANDLE_CASE +#undef HANDLE_SWITCH_CASE +#undef FOR_KERNEL_LOOP +#undef THRESH_NUMBER_BINS_FOR_GLOBAL_MEM +#undef THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM +} // namespace cuda + +namespace { +///////////////// bincount ///////////////// +template +Tensor _bincount_cuda_template( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + if (minlength < 0) { + AT_ERROR("minlength should be >= 0"); + } + if (self.dim() != 1 || self.numel() == 0 || + (!std::is_same::value && + *self.min().toBackend(kCPU).data() < 0)) { + AT_ERROR("bincount only supports 1-d non-negative integral inputs."); + } + + bool has_weights = weights.defined(); + if (has_weights && weights.size(0) != self.size(0)) { + AT_ERROR("input and weights should have the same length"); + } + + auto maxScalarGpu = Scalar(self.max()); + auto nbins = maxScalarGpu.local().to() + 1L; + nbins = std::max(nbins, minlength); + // alloc output counter on GPU + Tensor output; + if (has_weights) { + output = native::zeros({nbins}, weights.options()); + auto ret = cuda::CUDA_tensor_histogram( + output, self, weights, nbins, 1); + } else { + output = native::zeros({nbins}, device(kCUDA).dtype(kLong)); + auto ret = 
cuda::CUDA_tensor_histogram( + output, self, weights, nbins, 1); + } + return output; +} +} // namespace + +namespace native { +Tensor _bincount_cuda( + const Tensor& self, + const Tensor& weights, + int64_t minlength) { + return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] { + const auto scalar = weights.type().scalarType(); + if (scalar == ScalarType::Undefined || scalar == ScalarType::Float) + return _bincount_cuda_template(self, weights, minlength); + return _bincount_cuda_template( + self, weights.toType(CUDA(kDouble)), minlength); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu new file mode 100644 index 0000000..8e0cf4e --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -0,0 +1,41 @@ +#include "ATen/NativeFunctions.h" +#include "ATen/Dispatch.h" + +#include "ATen/cuda/CUDAApplyUtils.cuh" + +namespace { +template +void where_cuda( + at::Tensor& ret, + const at::Tensor& condition, + const at::Tensor& self, + const at::Tensor& other) { + // Yes this name is repetitive, but the CPU version is called + // CPU_tensor_apply4 and we don't have a CPU namespace or directory. + at::cuda::CUDA_tensor_apply4( + ret, + condition, + self, + other, + [] __device__( + scalar_t & ret_val, + const uint8_t& cond_val, + const scalar_t& self_val, + const scalar_t& other_val) { + ret_val = cond_val ? self_val : other_val; + }); +} +} // namespace + +namespace at { namespace native { +Tensor _s_where_cuda( + const Tensor& condition, + const Tensor& self, + const Tensor& other) { + Tensor ret = self.type().tensor(self.sizes()); + AT_DISPATCH_ALL_TYPES_AND_HALF(ret.type(), "where", [&] { + where_cuda(ret, condition, self, other); + }); + return ret; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu new file mode 100644 index 0000000..420733d --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -0,0 +1,88 @@ +#include "ATen/ATen.h" +#include "ATen/Error.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor& eye_out_cuda(Tensor& result, int64_t n) { + return at::native::eye_out_cuda(result, n, /*m=*/-1); +} + +Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else + AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif + +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else + if(m < 0) { +#endif + m = n; + } + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + +Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { + AT_CHECK(n >= 0, "n must be non-negative, got", n); + AT_CHECK(result.type().scalarTensor(n).defined(), + "n is too large for result tensor type: '", result.type().toString(), "'"); + + result.resize_({n}); + + if (result.type().scalarType() == at::ScalarType::Half) { + auto result_float = CUDA(kFloat).tensor({n}); + result.copy_(randperm_out_cuda(result_float, n, generator)); + } else { + if (n < 30000) { // For small inputs, we offload it to CPU instead. 
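+      // (A CPU randperm followed by a single copy back to the GPU is usually
+      // cheaper than the thrust-based sort path below for small n; 30000 is a
+      // heuristic cutoff, not a hard requirement.)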
+ auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + randperm_out(result_cpu, n, generator); + result.copy_(result_cpu); + } else { + // Generate random values for the keys array + AT_DISPATCH_ALL_TYPES( + result.type(), "randperm_out_cuda", [&] { + auto keys = result.type().tensor(result.sizes()).random_(generator); + + auto result_data = thrust::device_ptr(result.data()); + auto keys_data = thrust::device_ptr(keys.data()); + + auto state = globalContext().getTHCState(); + THCThrustAllocator thrustAlloc(state); + auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)); + + thrust::sequence(policy, result_data, result_data + n); + + // Use the sorted order of keys to rearrange the result array + thrust::sort_by_key(policy, keys_data, keys_data + n, result_data); + } + ); + } + } + + return result; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu new file mode 100644 index 0000000..cc8e78c --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -0,0 +1,124 @@ +#include "ATen/native/TensorTransformations.h" + +#include "ATen/cuda/detail/IndexUtils.cuh" +#include "ATen/NativeFunctions.h" + +#include +#include + +namespace at { +namespace native { + +#define AT_APPLY_THREADS_PER_BLOCK 32 * 16 +#define AT_APPLY_BLOCKS_PER_SM 4 + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void +kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo in_tensor_info, + cuda::detail::TensorInfo out_tensor_info, + IndexType N, + int flip_dim, + IndexType total_dims) { + for (IndexType linear_index = blockIdx.x * blockDim.x + threadIdx.x; linear_index < N; linear_index += gridDim.x * blockDim.x) { + IndexType dst_offset = 0; + if (flip_dim == 0) { + // flip 1st dim + dst_offset = (in_tensor_info.sizes[0] - 1 - linear_index / in_tensor_info.strides[0]) * in_tensor_info.strides[0] + linear_index % in_tensor_info.strides[0]; + } + else { + // flip last dim + IndexType i = total_dims - 1; + dst_offset = linear_index / in_tensor_info.strides[0] * in_tensor_info.strides[0] + (in_tensor_info.sizes[i] - 1 - linear_index % in_tensor_info.strides[0]); + } + out_tensor_info.data[dst_offset] = in_tensor_info.data[linear_index]; + } +} + +template +__global__ +void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, + int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { + + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + if (linear_index >= N) { + return; + } + + int64_t cur_indices = linear_index, rem = 0, dst_offset = 0; + for (int64_t i = 0; i < total_dims; i++) { + int64_t temp = cur_indices; + cur_indices = cur_indices / strides_contiguous[i]; + rem = temp - cur_indices * strides_contiguous[i]; + // flip the indices if it is in flip_dims + for (int64_t j = 0; j < flip_dims_size; j++) { + if (i == flip_dims[j]) { + cur_indices = shape[i] - 1 - cur_indices; + } + } + dst_offset += cur_indices * strides[i]; + cur_indices = rem; + } + out_tensor[linear_index] = in_tensor[dst_offset]; +} + +// Flip tensor given a list of dims +Tensor flip_cuda(const Tensor& self, IntList dims) { + auto in_tensor = self; + const int64_t flip_dims_size = dims.size(), total_dims = in_tensor.dim(), N = in_tensor.numel(); + check_errors(total_dims, flip_dims_size, dims); + + int64_t block_size = 512; + 
dim3 dim_block(block_size); + dim3 dim_grid((N + block_size - 1) / block_size); + + // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work + if (flip_dims_size == 1 && in_tensor.is_contiguous() && (dims[0] == 0 || dims[0] == total_dims - 1)) { + auto out_tensor = at::empty_like(self); + AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { + auto in_tensor_info = cuda::detail::getTensorInfo(in_tensor); + auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); + int flip_dim = in_tensor_info.collapseDims(dims[0]); + out_tensor_info.collapseDims(dims[0]); + kernel_pointwise_flip_apply2 + <<>>( + in_tensor_info, out_tensor_info, N, flip_dim, total_dims); + }); + return out_tensor; + } + + auto flip_dims = std::vector(dims); + auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); + + auto shape = std::vector(in_tensor.sizes()); + auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); + + auto strides = std::vector(in_tensor.strides()); + auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); + + auto out_tensor = at::empty_like(in_tensor); + + // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), + // it is used to compute indices for each element in non-contiguous tensor + Tensor stride_contiguous = at::zeros({total_dims}, kLong); + int64_t* stride_contiguous_d = stride_contiguous.data(); + for (int64_t i = total_dims - 1; i >= 0; i--) { + if (i == total_dims - 1) { + stride_contiguous_d[i] = 1; + } else { + stride_contiguous_d[i] = std::max(shape[i+1], 1) * stride_contiguous_d[i + 1]; + } + } + + AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { + flip_cuda_kernel<<>>( + in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, + strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); + }); + + return out_tensor; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu new file mode 100644 index 0000000..32dc7d3 --- /dev/null +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -0,0 +1,90 @@ +#include "ATen/ATen.h" + +#include +#include +#include + +#include +#include +#include + +namespace at { +namespace native{ + +#ifndef __HIP_PLATFORM_HCC__ + +namespace { +template +__global__ void inverse_indices_kernel( + const scalar_t* input_data, + const scalar_t* output_data, + int64_t* inverse_indices_data, + int64_t num_inp, + int64_t num_out) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride = blockDim.x * gridDim.x; + + for (int64_t i = idx; i < num_inp * num_out; i += stride) { + if (input_data[i / num_out] == output_data[i % num_out]){ + inverse_indices_data[i / num_out] = i % num_out; + } + } + } + + +template + std::tuple _unique_cuda_template( + const Tensor& self, + const bool return_inverse) { + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + const Tensor& input = self.contiguous(); + int64_t num_inp = input.numel(); + const scalar_t* input_data = input.data(); + + //sort & unique + Tensor output = input.clone(); + output = output.view(-1); + scalar_t* output_data = output.data(); + 
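+    // For example, an input of [1, 3, 1, 2] is sorted to [1, 1, 2, 3] and then
+    // compacted by thrust::unique to [1, 2, 3], so num_out == 3.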
thrust::sort(policy, output_data, output_data + num_inp); + scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp); + int64_t num_out = output_end - output_data; + output.resize_(num_out); + + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + + if (return_inverse) { + inverse_indices.resize_(input.sizes()); + int64_t* inverse_indices_data = inverse_indices.data(); + int block = 512; + int grid = std::min((num_inp * num_out + block - 1) / block, 2048L); + inverse_indices_kernel<<>>( + input_data, output_data, inverse_indices_data, num_inp, num_out); + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + + } +} // namespace + +#endif + +std::tuple +_unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { +#ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] { + // The current CUDA implementation of unique always sort due to the + // lack of hashtable implementation in thrust + return _unique_cuda_template(self, return_inverse); + }); +#else + AT_ERROR("unique_cuda: HIP not supported"); +#endif +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp new file mode 100644 index 0000000..f73a2ad --- /dev/null +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor cudnn_affine_grid_generator_forward( + const Tensor& theta, + int64_t N, int64_t C, int64_t H, int64_t W) { + throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); +} + +Tensor cudnn_affine_grid_generator_backward( + const Tensor& grad_theta, + int64_t N, int64_t C, int64_t H, int64_t W) { + throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED() + +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +void setSamplerDescriptor(SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + int N, int C, int H, int W) +{ + int inputSize[4] = {N, C, H, W}; + desc.set(dataType, 4, inputSize); +} + +} // namespace + +Tensor cudnn_affine_grid_generator_forward( + const Tensor& theta_t, + int64_t N, int64_t C, int64_t H, int64_t W) +{ + setCuDNNStreamToCurrent(); + + TensorArg theta{ theta_t.contiguous(), "theta", 1 }; + CheckedFrom c = "cudnn_affine_grid_generator_forward"; + checkContiguous(c, theta); + checkSize(c, theta, {N, 2, 3}); + + auto grid_t = theta->type().tensor(); + grid_t.resize_({N, H, W, 2}); + + auto dataType = getCudnnDataType(*theta); + SpatialTransformerDescriptor desc; + setSamplerDescriptor(desc, dataType, N, C, H, W); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward(getCudnnHandle(), desc.desc(), + theta->data_ptr(), + grid_t.data_ptr())); + return grid_t; +} + +Tensor cudnn_affine_grid_generator_backward( + const Tensor& grad_grid_t, + int64_t N, int64_t C, int64_t H, int64_t W) +{ + setCuDNNStreamToCurrent(); + + TensorArg grad_grid{ grad_grid_t.contiguous(), "grad_grid", 1 }; + CheckedFrom c = "cudnn_affine_grid_generator_backward"; + checkContiguous(c, grad_grid); + checkSize(c, grad_grid, {N, H, W, 2}); + + auto grad_theta_t = grad_grid->type().tensor(); + grad_theta_t.resize_({N, 2, 3}); 
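+  // grad_grid has shape N x H x W x 2 (one (x, y) sampling location per
+  // output pixel, matching the forward grid); cuDNN reduces it back to the
+  // N x 2 x 3 affine parameters in grad_theta below.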
+ + auto dataType = getCudnnDataType(grad_theta_t); + SpatialTransformerDescriptor desc; + setSamplerDescriptor(desc, dataType, N, C, H, W); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward(getCudnnHandle(), desc.desc(), + grad_grid->data_ptr(), + grad_theta_t.data_ptr())); + return grad_theta_t; +} + +}} // namespace at::native + +#endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp new file mode 100644 index 0000000..9b2a256 --- /dev/null +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple cudnn_batch_norm( + const Tensor& input, const Tensor& weight, + const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, + bool training, double exponential_average_factor, double epsilon) { + throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_batch_norm_backward( + const Tensor& input, const Tensor& grad_output, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean, const Tensor& save_var, + double epsilon) { + throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); +} + +}} // namespace at::native + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +Tensor expandScale(const Tensor& t, int64_t dim) { + std::vector size{ 1, t.numel() }; + while (static_cast(size.size()) < dim) { + size.emplace_back(1); + } + return t.view(size); +} + +} // namespace + +std::tuple cudnn_batch_norm( + const Tensor& input_t, const Tensor& weight_t, + const Tensor& bias_t, const Tensor& running_mean_t, const Tensor& running_var_t, + bool training, double exponential_average_factor, double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + weight{ weight_t, "weight", 2 }, + bias{ bias_t, "bias", 3 }, + running_mean{ running_mean_t, "running_mean", 4 }, + running_var{ running_var_t, "running_var", 5 }; + CheckedFrom c = "cudnn_batch_norm"; + setCuDNNStreamToCurrent(); + + checkAllDefined(c, {input, weight, bias}); + if (!training) { + checkAllDefined(c, {running_mean, running_var}); + } + checkAllSameGPU(c, {input, weight, bias, running_mean, running_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {weight, bias, running_mean, running_var}); + // TODO: is weight required to be contiguous? 
+ checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + auto num_features = input->size(1); + for (auto t : {weight, bias, running_mean, running_var}) { + if (t->defined()) { + checkNumel(c, t, num_features); + } + } + + cudnnBatchNormMode_t mode; + if (input->dim() == 2) { + mode = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode = CUDNN_BATCHNORM_SPATIAL; +#if CUDNN_VERSION >= 7003 + if(training) + mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#endif + } + + auto output_t = input->type().tensor(input->sizes()); + TensorArg output{ output_t, "output", 0 }; + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + TensorDescriptor idesc{ *input, 4 }; // input descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. + + Constant one(dataType, 1); + Constant zero(dataType, 0); + Tensor save_mean, save_var; + + if (training) { + int64_t num_features = input_t.size(1); + save_mean = weight_t.type().tensor({ num_features }); + save_var = weight_t.type().tensor({ num_features }); + AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + exponential_average_factor, + at::maybe_data_ptr(running_mean), + at::maybe_data_ptr(running_var), + epsilon, + save_mean.data_ptr(), + save_var.data_ptr())); + } else { + AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + running_mean->data_ptr(), + running_var->data_ptr(), + epsilon)); + } + + // save_mean and save_var can be undefined + // If this causes problems, we can initialize them to empty tensors + // of the correct type + return std::tuple{output_t, save_mean, save_var}; +} + +// NB: CuDNN only implements the backward algorithm for batchnorm +// in training mode (evaluation mode batchnorm has a different algorithm), +// which is why this doesn't accept a 'training' parameter. +std::tuple cudnn_batch_norm_backward( + const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t, + // Unused: but we require them to be passed so that double backwards + // has access + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean_t, const Tensor& save_var_t, + double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + grad_output{ grad_output_t, "grad_output", 2 }, + weight{ weight_t, "weight", 3 }, + save_mean{ save_mean_t, "save_mean", 4 }, + save_var{ save_var_t, "save_var", 5 }; + CheckedFrom c = "cudnn_batch_norm_backward"; + setCuDNNStreamToCurrent(); + + checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); + checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {input, grad_output}); + checkAllSameType(c, {weight, save_mean, save_var}); + // TODO: is weight required to be contiguous? 
+ checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + checkSameSize(c, input, grad_output); + auto num_features = input->size(1); + for (auto t : {weight, save_mean, save_var}) { + checkNumel(c, t, num_features); + } + + cudnnBatchNormMode_t mode; + if (input->dim() == 2) { + mode = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { +#if CUDNN_VERSION >= 7003 + mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode = CUDNN_BATCHNORM_SPATIAL; +#endif + } + + auto grad_input_t = input->type().tensor(input->sizes()); + auto grad_weight_t = weight->type().tensor(weight->sizes()); + auto grad_bias_t = weight->type().tensor(weight->sizes()); + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + + TensorDescriptor idesc{ *input, 4 }; // input, output, grad_output descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, save_mean, etc. + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnBatchNormalizationBackward( + handle, mode, &one, &zero, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), grad_output->data_ptr(), + idesc.desc(), grad_input_t.data_ptr(), + wdesc.desc(), weight->data_ptr(), + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->data_ptr(), + save_var->data_ptr())); + + return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; +} + +}} // namespace native + +#endif diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp new file mode 100644 index 0000000..b3ee016 --- /dev/null +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -0,0 +1,1204 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_bias( + const at::Tensor& grad_output) { + throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + 
IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose_backward_input( + const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include "THC/THC.h" + +#include +#include +#include +#include +#include "ATen/native/utils/ParamsHash.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// TODO: Go through all the checking code again and make sure +// we haven't missed anything. + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. 
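+// For example, with kernel 3, stride 2, padding 1 and dilation 1 (using the
+// formulas implemented just below), spatial inputs of size 223 and 224 both
+// map to an output of size 112:
+//
+//   (223 + 2*1 - 3) / 2 + 1 == 112
+//   (224 + 2*1 - 3) / 2 + 1 == 112
+//
+// so the input size cannot be recovered from the output size alone;
+// output_padding of 0 vs. 1 picks between the two candidates:
+//
+//   (112 - 1) * 2 - 2*1 + 3 + 0 == 223
+//   (112 - 1) * 2 - 2*1 + 3 + 1 == 224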
+ +std::vector conv_output_size( + IntList input_size, IntList weight_size, + IntList padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (size_t d = 2; d < dim; ++d) { + auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) + - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +std::vector conv_input_size( + IntList output_size, IntList weight_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +std::vector conv_weight_size( + IntList input_size, IntList output_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + 2 * padding[d - 2] - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +// TODO: Move this into the standard library, with a better name? +Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { + auto group_size = t.size(dim) / groups; + return t.narrow(dim, group_idx * group_size, group_size); +} + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Note [Legacy CuDNN grouped convolution support] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// CuDNN earlier than CuDNN 7 does not directly support group +// convolution, so we provide support for it by sequentially +// running a convolution per group with appropriately +// adjusted sizes. https://blog.yani.io/filter-group-tutorial/ +// has a fairly good diagram explaining how it works. 
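+// As a concrete sketch of that emulation (illustrative only; the dispatch
+// functions below open-code this loop under #if CUDNN_VERSION < 7000, and
+// per_group_conv here stands in for one of the raw_cudnn_* calls): each group
+// is a slice of the channel dimensions, carved out with narrowGroup above,
+// and the per-group convolution runs with groups == 1.
+//
+//   template <typename PerGroupConv>
+//   void run_grouped(const Tensor& input, const Tensor& weight,
+//                    const Tensor& output, int64_t groups,
+//                    PerGroupConv per_group_conv) {
+//     for (int i = 0; i < groups; i++) {
+//       per_group_conv(
+//           narrowGroup(input, input_channels_dim, i, groups),
+//           narrowGroup(weight, weight_output_channels_dim, i, groups),
+//           narrowGroup(output, output_channels_dim, i, groups));
+//     }
+//   }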
+ +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) +{ + if (args.size() > expected_size){ + std::stringstream ss; + ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + else if (args.size() < expected_size){ + std::stringstream ss; + ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } +} + + +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) +static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntList padding, IntList stride, IntList dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// This POD struct is used to let us easily compute hashes of the +// parameters +struct ConvolutionParams +{ + cudnnDataType_t dataType; + int input_size[2 + max_dim]; + int input_stride[2 + max_dim]; + int weight_size[2 + max_dim]; + int padding[max_dim]; + int stride[max_dim]; + int dilation[max_dim]; + int64_t groups; + bool deterministic; + // NB: transposed purposely omitted: transposed just swaps + // forward and backward, so you can reuse the benchmark entry, +}; + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. +// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. 
(OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool deterministic) { + + cudnnDataType_t dataType = getCudnnDataType(input); + memset(params, 0, sizeof(ConvolutionParams)); + params->dataType = dataType; + // ASSERT(weight.dim() == input.dim()) + for (int i = 0; i != input.dim(); ++i) { + params->input_size[i] = (int) input.size(i); + params->input_stride[i] = (int) input.stride(i); + params->weight_size[i] = (int) weight.size(i); + } + // ASSERT(padding.size() == stride.size()) + // ASSERT(padding.size() == dilation.size()) + for (size_t i = 0; i != padding.size(); ++i) { + params->padding[i] = padding[i]; + params->stride[i] = stride[i]; + params->dilation[i] = dilation[i]; + } + // In principle, we shouldn't parametrize by groups for legacy + // CuDNN, but it doesn't seem worth the effort to actually do this. + params->groups = groups; + params->deterministic = deterministic; +} + +// Convenience struct for passing around descriptors and data +// pointers +struct ConvolutionArgs { + cudnnHandle_t handle; + ConvolutionParams params; + TensorDescriptor idesc, odesc; + FilterDescriptor wdesc; + const Tensor& input, output, weight; + ConvolutionDescriptor cdesc; + + ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { + } +}; + +// --------------------------------------------------------------------- +// +// Benchmarking +// +// --------------------------------------------------------------------- + +// TODO: Use something less heavy duty than a big honking mutex +template +struct BenchmarkCache { + std::mutex mutex; + std::unordered_map, ParamsEqual> map; + + bool find(const ConvolutionParams& params, T* results) { + std::lock_guard guard(mutex); + auto it = map.find(params); + if (it == map.end()) { + return false; + } + *results = it->second; + return true; + } + + void insert(const ConvolutionParams& params, const T& results) { + std::lock_guard guard(mutex); + map[params] = results; + } +}; + +BenchmarkCache fwd_algos; +BenchmarkCache bwd_data_algos; +BenchmarkCache bwd_filter_algos; + +// TODO: Stop manually allocating CUDA memory; allocate an ATen byte +// tensor instead. 
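+// A sketch of what that TODO could look like (an assumption, not code used in
+// this file): back the workspace with an ATen byte tensor so the caching
+// allocator owns the memory, assuming a CUDA byte factory analogous to the
+// at::CPU(kInt) factory used later in this patch:
+//
+//   Tensor alloc_workspace(size_t size) {
+//     return at::CUDA(at::kByte).tensor(static_cast<int64_t>(size));
+//   }
+//
+// For now, Workspace below manages the allocation manually with
+// THCudaMalloc/THCudaFree.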
+struct Workspace { + Workspace(size_t size) : size(size), data(NULL) { + data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + } + Workspace(const Workspace&) = delete; + Workspace(Workspace&&) = default; + Workspace& operator=(Workspace&&) = default; + ~Workspace() { + if (data) { + THCudaFree(globalContext().lazyInitCUDA(), data); + } + } + + size_t size; + void* data; +}; + +template +struct algorithm_search { +}; + +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionFwdAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + sz + ); +} +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + sz); +} +cudnnStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz) +{ + return cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + sz); +} + +template +size_t getMaxWorkspaceSize( + const ConvolutionArgs& args, + const algo_t *algo, int n_algo) +{ + THCState *state = globalContext().lazyInitCUDA(); + + size_t max_ws_size = 0; + size_t max_block_size = 0; + size_t total_gpu_mem = 0; + size_t free_gpu_mem = 0; + + THCudaCheck(THCudaMemGetInfoCached(state, &free_gpu_mem, &total_gpu_mem, &max_block_size)); + + for (int i = 0; i < n_algo; i++) { + cudnnStatus_t err; + size_t sz; + err = getWorkspaceSize(args, algo[i], &sz); + if (CUDNN_STATUS_SUCCESS != err || sz == 0 + || sz < max_ws_size || sz > max_block_size) continue; + max_ws_size = sz; + } + return max_ws_size; +} + +template +perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { + if (deterministic) { + // iterate over perf results of all algorithms and find the best deterministic algo + for (int i = 0; i < n_algo; i++) { + // TODO: Shouldn't all returned results be successful? 
+ // Double check documentation for cudnnFindConvolutionForwardAlgorithmEx + if (perfResults[i].status == CUDNN_STATUS_SUCCESS && + perfResults[i].determinism == CUDNN_DETERMINISTIC) { + return perfResults[i]; + } + } + throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + } else { + return perfResults[0]; + } +} + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionFwdAlgoPerf_t; + using algo_t = cudnnConvolutionFwdAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static BenchmarkCache& cache() { return fwd_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, + }; + static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution forward algorithms"); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + AT_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), args.output.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm( + const ConvolutionArgs& args, + algo_t* algo) + { + cudnnConvolutionFwdPreference_t pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + AT_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + pref, + 0, + algo)); + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize)); + } +}; + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; + using algo_t = cudnnConvolutionBwdDataAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + static BenchmarkCache& cache() { return bwd_data_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED + }; + static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward data algorithms."); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + AT_CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, + args.wdesc.desc(), 
args.weight.data_ptr(), + args.odesc.desc(), args.output.data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), args.input.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, + 0, + algo)); + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize)); + } +}; + +template<> +struct algorithm_search { + using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; + using algo_t = cudnnConvolutionBwdFilterAlgo_t; + + static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + + static BenchmarkCache& cache() { return bwd_filter_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, +#if CUDNN_VERSION >= 6000 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, +#endif + }; + // NOTE: - 1 because ALGO_WINOGRAD is not implemented + static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + int perf_count; + Workspace ws(max_ws_size); + + AT_CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.odesc.desc(), args.output.data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), args.weight.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, + 0, + algo) + ); + } + + static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) + { + AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize)); + } +}; + +template +void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { + using search = algorithm_search; + auto& cache = search::cache(); + + if (cache.find(args.params, algo)) { + return; + } + + if (args.params.deterministic && !benchmark) { + *algo = search::DEFAULT_ALGO; + return; + } + + if (!benchmark) { + search::getAlgorithm(args, algo); + return; + } + + if (cache.find(args.params, algo)) { + // re-check cache since another thread may have benchmarked the algorithm + return; + } 
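+  // Neither the cache nor a shortcut applies, so run the actual benchmark:
+  // search::findAlgorithm below times the candidate algorithms via
+  // cudnnFind*AlgorithmEx, the winning (optionally deterministic) result is
+  // cached, and DEFAULT_ALGO is used if nothing usable is reported.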
+ + auto perfResults = search::findAlgorithm(args); + // for deterministic algo, look at all the perf results and return the best + // deterministic algo + if (perfResults.status == CUDNN_STATUS_SUCCESS && + !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) { + *algo = perfResults.algo; + } else { + *algo = search::DEFAULT_ALGO; + } + cache.insert(args.params, *algo); + + // Free the cached blocks in our caching allocator. They are + // needed here because the above benchmarking uses a huge amount of memory, + // e.g. a few GBs. + THCCachingAllocator_emptyCache(); +} + +template +Workspace chooseAlgorithm( + const ConvolutionArgs& args, + bool benchmark, + algo_t* algo) +{ + findAlgorithm(args, benchmark, algo); + + using search = algorithm_search; + size_t workspace_size; + search::getWorkspaceSize(args, *algo, &workspace_size); + try { + return Workspace(workspace_size); + } catch (std::runtime_error& e) { + cudaGetLastError(); // clear OOM error + + // switch to default algorithm and record it in the cache to prevent + // further OOM errors + *algo = search::DEFAULT_ALGO; + search::cache().insert(args.params, *algo); + + search::getWorkspaceSize(args, *algo, &workspace_size); + return Workspace(workspace_size); + } +} + +// --------------------------------------------------------------------- +// +// Bias addition +// +// --------------------------------------------------------------------- + +// In-place! +void cudnn_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias) +{ + checkAllSameType(c, {output, bias}); + checkAllSameGPU(c, {output, bias}); + checkSize(c, bias, { output->size(output_channels_dim) }); + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's padding argument to do the rest. + TensorDescriptor bdesc, odesc; + bdesc.set(bias->expand({1, bias->size(0)}), output->dim()); + odesc.set(*output); + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*bias); + Constant one(dataType, 1); + + AT_CUDNN_CHECK(cudnnAddTensor(handle, &one, bdesc.desc(), bias->data_ptr(), + &one, odesc.desc(), output->data_ptr())); +} + +// The general strategy: +// +// - cudnn_convolution (Tensor) +// Entry points for clients, takes bias +// +// - cudnn_convolution_forward (TensorArg) +// Entry point, which may be reused between regular +// convolution and transposed convolution. Does NOT take bias. +// +// - raw_cudnn_convolution_forward_out (Tensor) +// Low level function which invokes CuDNN, and takes an output +// tensor which is directly written to (thus _out). +// +// Where does argument checking happen? Here's the division of +// responsibility: +// - Things that happen in at::Tensor +// - TensorArg allocation +// - setCuDNNStreamToCurrent +// - Things that happen in TensorArg +// - Check arguments (type, GPU, shape) +// +// TODO: Consider renaming zero-indexed arguments to "self" + + + +// --------------------------------------------------------------------- +// +// Convolution forward / Transposed convolution backward +// +// --------------------------------------------------------------------- + +// The raw API directly invokes CuDNN and does not emulate support +// for group convolution on old versions of CuDNN. +// +// There are a few reasons this should never be directly exposed +// via ATen: +// +// - It takes output as a parameter (this should be computed!) 
+// - It doesn't do input checking +// - It doesn't resize output (it is assumed to be correctly sized) +// - It takes a ConvolutionParams struct +// +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(input); + + ConvolutionArgs args{ input, output, weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(weight); + args.odesc.set(output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + // TODO: when we do legacy group convolution support, we'll repeatedly + // reinitialize the workspace for each convolution we do. This is + // wasteful; we'd rather reuse the workspace. OTOH, legacy group + // convolution support is already pretty slow, so this might not + // matter. (This applies to raw_cudnn_convolution_backward_input as well.) + cudnnConvolutionFwdAlgo_t fwdAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlg, workspace.data, workspace.size, + &zero, args.odesc.desc(), output.data_ptr())); +} + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + auto output_t = input->type().tensor( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation, groups)); + + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{ output_t, "result", 0 }; + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_forward_out( + narrowGroup(*output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_forward_out( + *output, *input, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *output; +} + +Tensor cudnn_convolution( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + setCuDNNStreamToCurrent(); + CheckedFrom c = "cudnn_convolution"; + auto output_t = cudnn_convolution_forward( + c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// NB: output_padding not needed here, as there is no ambiguity to +// resolve +Tensor 
cudnn_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg grad_output { grad_output_t, "grad_output", 1 }, + weight { weight_t, "weight", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_forward( + "cudnn_convolution_transpose_backward_input", + grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// +// Convolution backward / Transposed convolution forward +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(grad_output); + + ConvolutionArgs args{ grad_input, grad_output, weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(grad_input); + args.wdesc.set(weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + cudnnConvolutionBwdDataAlgo_t bwdDataAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardData( + args.handle, + &one, args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.data_ptr(), + args.cdesc.desc(), bwdDataAlg, workspace.data, workspace.size, + &zero, args.idesc.desc(), grad_input.data_ptr())); +} + +// Backward and transpose are algorithmically equivalent, but they +// compute their geometry differently. In a backwards, you knew what +// the original size of the input tensor was, so you can cache that +// geometry and fill it directly. In transposed convolution, it is +// more conventional to not explicitly specify the output (previously +// input) size, and compute it. This, however, leaves a degree of +// freedom; this degree of freedom is resolved using the +// output_padding parameter. Both of these interfaces are equivalent, +// but they are differently convenient depending on the use case. 
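+// Concretely, the entry points in this file pair up as follows:
+//
+//   cudnn_convolution (forward):
+//       conv_output_size                    -> cudnnConvolutionForward
+//   cudnn_convolution_backward_input:
+//       caller-supplied input_size          -> cudnnConvolutionBackwardData
+//   cudnn_convolution_transpose (forward):
+//       conv_input_size (+ output_padding)  -> cudnnConvolutionBackwardData
+//   cudnn_convolution_transpose_backward_input:
+//       conv_output_size                    -> cudnnConvolutionForward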
+ +Tensor cudnn_convolution_backward_input( + CheckedFrom c, + IntList input_size, const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + + auto grad_input_t = grad_output->type().tensor(input_size); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_input_out( + narrowGroup(*grad_input, input_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_input_out( + *grad_input, *grad_output, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *grad_input; +} + +Tensor cudnn_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), + padding, output_padding, stride, dilation, groups); + return cudnn_convolution_backward_input(c, input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_backward_input( + IntList input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_input( + "cudnn_convolution_backward_input", + input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor cudnn_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + CheckedFrom 
c = "cudnn_convolution_transpose"; + auto output_t = cudnn_convolution_transpose_forward( + c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getCudnnDataType(input); + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getCudnnHandle(); + setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + cudnnConvolutionBwdFilterAlgo_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardFilter( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.odesc.desc(), grad_output.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, workspace.data, workspace.size, + &zero, args.wdesc.desc(), grad_weight.data_ptr())); +} + +Tensor cudnn_convolution_backward_weight( + CheckedFrom c, + IntList weight_size, const TensorArg& grad_output, const TensorArg& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = grad_output->type().tensor(weight_size); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
+ TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_weight_out( + narrowGroup(*grad_weight, weight_output_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + padding, stride, dilation, groups, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_weight_out( + *grad_weight, *grad_output, *input, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return grad_weight_t; +} + +Tensor cudnn_convolution_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setCuDNNStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + setCuDNNStreamToCurrent(); + + auto grad_bias_t = grad_output->type().tensor( + { grad_output->size(output_channels_dim) }); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's pad argument to do the rest. 
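+  // i.e. the 1-d bias of shape (C) is expanded to (1, C) by hand, and the
+  // descriptor's pad argument then appends trailing singleton dimensions up
+  // to grad_output->dim(), so cuDNN sees (1, C, 1, 1) in the usual 4-d case.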
+ TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK(cudnnConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +} + + +}} // namespace + +#endif diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp new file mode 100644 index 0000000..c6b7ffc --- /dev/null +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -0,0 +1,147 @@ +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor cudnn_grid_sampler_forward( + const Tensor& input_t, const Tensor& grid_t) { + throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_grid_sampler_backward( + const Tensor& input_t, const Tensor& grid_t, + const Tensor& grad_output_t) { + throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +// TODO: descriptor checking + + +namespace at { namespace native { + +namespace { + +void setSamplerDescriptor(SpatialTransformerDescriptor& desc, cudnnDataType_t dataType, const at::Tensor& tensor) +{ + int inputSize[4] = {0}; + for (int i = 0; i < tensor.dim(); ++i) { + inputSize[i] = (int) tensor.size(i); + } + desc.set(dataType, 4, inputSize); +} + +void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) +{ + // assert size of grid is n*h*w*2 + // FYI: grid is between [-1, 1], where -1 left most pixel, + // 1 represents right most pixel (and hence 0 is the center pixel) + // if grid has values >1 or <-1, those values are ignored + checkContiguous(c, grid); + checkDim(c, grid, 4); + // TODO: Maybe more user friendly to report where the expected size + // came from + checkSize(c, grid, 0, input->size(0)); + checkSize(c, grid, 3, 2); +} + +} // namespace + +Tensor cudnn_grid_sampler_forward( + const Tensor& input_t, const Tensor& grid_t) +{ + TensorArg input{ contiguousIfZeroInStrides(input_t), "input", 1 }, + grid{ grid_t.contiguous(), "grid", 2 }; + CheckedFrom c = "cudnn_grid_sampler_forward"; + setCuDNNStreamToCurrent(); + checkAllSameGPU(c, {input, grid}); + checkAllSameType(c, {input, grid}); + checkGridSize(c, grid, input); + checkDim(c, input, 4); + + auto output_t = input->type().tensor(); + output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); + + TensorDescriptor idesc{ *input }; // input descriptor + TensorDescriptor odesc{ output_t }; // output descriptor + SpatialTransformerDescriptor desc; // sampler descriptor + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + setSamplerDescriptor(desc, dataType, output_t); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + AT_CUDNN_CHECK(cudnnSpatialTfSamplerForward( + handle, desc.desc(), + &one, idesc.desc(), input->data_ptr(), + grid->data_ptr(), + &zero, odesc.desc(), output_t.data_ptr() + )); + + return output_t; +} + +// NB: CuDNN does not support output mask; you always get both +// gradients. 
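+// Shape-wise (for both directions): an input of shape (N, C, H_in, W_in)
+// sampled with a grid of shape (N, H_out, W_out, 2) produces an output of
+// shape (N, C, H_out, W_out); the backward below returns grad_input with
+// input's shape and grad_grid with grid's shape. Each grid entry is a pair of
+// normalized coordinates in [-1, 1], as described in checkGridSize above.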
+std::tuple cudnn_grid_sampler_backward( + const Tensor& input_t, const Tensor& grid_t, + const Tensor& grad_output_t) +{ + TensorArg input{ contiguousIfZeroInStrides(input_t), "input", 1 }, + grid{ grid_t.contiguous(), "grid", 2 }, + grad_output{ contiguousIfZeroInStrides(grad_output_t), "grad_output", 3 }; + CheckedFrom c = "cudnn_grid_sampler_backward"; + setCuDNNStreamToCurrent(); + checkAllSameGPU(c, {input, grad_output, grid}); + checkGridSize(c, grid, input); + checkDim(c, input, 4); + checkDim(c, grad_output, 4); + + auto grad_input_t = input->type().tensor(); + grad_input_t.resize_(input->sizes()); + auto grad_grid_t = grid->type().tensor(); + grad_grid_t.resize_(grid->sizes()); + + TensorDescriptor idesc{ *input }; // input descriptor + TensorDescriptor odesc{ *grad_output }; // grad_output descriptor + TensorDescriptor gdesc{ grad_input_t }; // grad_input descriptor + SpatialTransformerDescriptor desc; // sampler descriptor + + auto handle = getCudnnHandle(); + auto dataType = getCudnnDataType(*input); + setSamplerDescriptor(desc, dataType, *grad_output); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + AT_CUDNN_CHECK(cudnnSpatialTfSamplerBackward( + handle, desc.desc(), + &one, idesc.desc(), input->data_ptr(), + &zero, gdesc.desc(), grad_input_t.data_ptr(), + &one, odesc.desc(), grad_output->data_ptr(), + // intruigingly, the outputs don't need descriptors + grid->data_ptr(), + &zero, grad_grid_t.data_ptr() + )); + + return std::tuple{ grad_input_t, grad_grid_t }; +} + +}} // namespace at::cudnn + +#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp new file mode 100644 index 0000000..aced0a0 --- /dev/null +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -0,0 +1,1011 @@ +#include +#include +#include +#include +#include +#include +#include + +#if !AT_CUDNN_ENABLED() + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +Tensor _cudnn_rnn_flatten_weight( + TensorList weight_arr, int64_t weight_stride0, + int64_t input_size, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, + bool fn_bidirectional + ) { + throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); +} + +std::tuple _cudnn_rnn( + const Tensor& input_r, + TensorList weight, int64_t weight_stride0, + const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state + ) { + throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); +} + +std::tuple> _cudnn_rnn_backward( + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r, + const Tensor& grad_cy_r, + int64_t mode, int64_t hidden_size, + int64_t num_layers, bool batch_first, double dropout, + bool train, bool bidirectional, IntList batch_sizes, + const Tensor& dropout_state, const Tensor& reserve, + std::array output_mask + ) { + throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); +} + +Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { + throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); +} + +}} // namespace at::native + +#else // 
AT_CUDNN_ENABLED() + +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + // DropoutDescriptor + + struct DropoutDescriptorParams { + bool train; + double dropout; + Tensor dropout_state; + DropoutDescriptorParams() {} + void set(bool train_, double dropout_, Tensor dropout_state_) { + train = train_; + dropout = dropout_; + dropout_state = dropout_state_; + } + DropoutDescriptor descriptor(cudnnHandle_t handle) const { + auto dropout_p = train ? dropout : 0; + DropoutDescriptor dropout_desc; + if (dropout_p == 0) { + dropout_desc.set_no_dropout(handle); + } else { + dropout_desc.set(handle, dropout_p, dropout_state); + } + return dropout_desc; + } + }; + + // RNNDescriptor + + struct RNNDescriptorParams { + int64_t hidden_size; + int64_t num_layers; + cudnnDirectionMode_t bidirectional; + cudnnRNNMode_t mode; + cudnnDataType_t datatype; + + cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; + + int64_t num_directions() const { + return bidirectional ? 2 : 1; + } + + void set_mode(int64_t fn_mode) { + switch (fn_mode) { + case CUDNN_RNN_RELU: + mode = CUDNN_RNN_RELU; + break; + case CUDNN_RNN_TANH: + mode = CUDNN_RNN_TANH; + break; + case CUDNN_LSTM: + mode = CUDNN_LSTM; + break; + case CUDNN_GRU: + mode = CUDNN_GRU; + break; + default: + { + std::ostringstream oss; + oss << "unrecognized cuDNN RNN mode " << fn_mode; + throw std::runtime_error(oss.str()); + } + } + } + + void set_bidirectional(bool fn_bidirectional) { + bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + } + + void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype) { + this->set_mode(mode); + this->hidden_size = hidden_size; + this->num_layers = num_layers; + this->set_bidirectional(bidirectional); + this->datatype = datatype; + } + + + RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { + RNNDescriptor rnn_desc; + rnn_desc.set(handle, hidden_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype); + return rnn_desc; + } + + // In some cases, a use of RNNDescriptor does not rely on the + // DropoutDescriptor. In this case, we fake up a no-dropout + // descriptor to make the RNN descriptor initialization go through. + // This is used by _cudnn_rnn_flatten_weight, which needs an + // RNNDescriptor for get_parameters(), but does not actually need + // a fully initialized dropout descriptor. This lets us avoid + // having to pass the dropout state to flatten, which has no business + // knowing what the dropout state is. + RNNDescriptor descriptor(cudnnHandle_t handle) const { + DropoutDescriptor dropout_desc; + dropout_desc.set_no_dropout(handle); + return descriptor(handle, std::move(dropout_desc)); + } + }; + + // TensorDescriptor list + + std::vector rnn_descriptor_sequence(const Tensor& tensor, IntList batch_sizes) { + std::vector descriptors(batch_sizes.size()); + size_t i = 0; + // To be mutated in the loop + std::vector batch_tensor_size(tensor.sizes()); + for (auto batch_size : batch_sizes) { + batch_tensor_size[0] = batch_size; + // NB: cuDNN RNN API does not support 2d descriptors, so we + // must pad it out to 3d. 
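+      // (Concretely: the (batch_size, input_size) slice for one time step is
+      // presumably presented to cuDNN as (batch_size, input_size, 1), since
+      // set() is asked for at least 3 dimensions here.)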
+ descriptors[i].set(getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); + i++; + } + return descriptors; + } + + std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { + std::vector descriptors(N); + for (int64_t i = 0; i < N; i++) { + descriptors[i].set(tensor, 5); + } + return descriptors; + } + + // The best way to understand the meaning of the values stored in + // this struct is to consider each of the possible ways our + // input can be structured. + // + // Suppose you want to run RNN on the following variable + // length inputs: + // + // Sequence 1: ABCD + // Sequence 2: EF + // Sequence 3: G + // + // (Let _ be padding when we have non-packed representations.) + // + // # Packed input (batch_sizes is non-empty) + // + // input_size + // +------+ + + // | A | | + // | E | mini_batch = | + // | G | batch_sizes[0] = 3 | + // +------+ | + // | B | | batch_sizes_sum = 7 + // | F | batch_sizes[1] = 2 | + // +------+ | + // | C | batch_sizes[2] = 1 | + // +------+ | + // | D | batch_sizes[3] = 1 | + // +------+ + + // + // (seq_length = 4) + // + // input.size() = batch_sizes_sum x input_size + // + // # Unpacked input (batch_first = false) + // + // mini_batch = 3 + // +-------+ + // | A E G | + // | B F _ | seq_length = 4 + // | C _ _ | + // | D _ _ | + // +-------+ + // ... input_size + // +-------+ + // + // input.size() = seq_length x mini_batch x input_size + // + // # Unpacked input (batch_first = true) + // + // seq_length = 4 + // +---------+ + // | A B C D | + // | E F _ _ | mini_batch = 3 + // | G _ _ _ | + // +---------+ + // ... input_size + // +---------+ + // + // input.size() = mini_batch x seq_length x input_size + // + struct TensorDescriptorListParams { + IntList batch_sizes; + int64_t seq_length; + int64_t mini_batch; + // NB: this is not input.size(), which is an IntList; instead, this + // size of the inner-most dimension. In NL applications, this is usually + // the size of the embedding. You can also think of this as the size + // of the "channel" dimension (at risk of confusing vision researchers :) + int64_t input_size; + // Only valid when !is_input_packed + int64_t batch_sizes_sum; // == sum(batch_sizes) + + bool is_input_packed() const { + return batch_sizes.size() != 0; + } + + void set(IntList input_sizes, IntList batch_sizes_, bool batch_first) { + batch_sizes = batch_sizes_; + if (is_input_packed()) { + seq_length = batch_sizes.size(); + mini_batch = batch_sizes[0]; + // NB: When input is packed, the mini_batch size is NOT the size + // of the outer dimension + batch_sizes_sum = input_sizes[0]; + input_size = input_sizes[1]; + } else { + if (batch_first) { + seq_length = input_sizes[1]; + mini_batch = input_sizes[0]; + } else { + seq_length = input_sizes[0]; + mini_batch = input_sizes[1]; + } + input_size = input_sizes[2]; + // TODO: Actually, would this make ASAN's job harder catching + // an uninitialized access? + batch_sizes_sum = -1; // something bogus in case we access it + } + } + + // TODO: check x for consistency with input_size? 
+ std::vector descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence(x, batch_sizes); + } else { + return rnn_descriptor(x[0], seq_length); + } + } + }; + + // Everything together + + struct RNNParams { + DropoutDescriptorParams dropout; + RNNDescriptorParams rnn; + TensorDescriptorListParams tensors; + }; + + // NB: Doesn't include the weight descriptor + struct RNNDescriptors { + RNNDescriptor rnn_desc; + // NB: this won't actually lay out the tensor descriptor pointers + // in the right way, so you'll have to preprocess them + std::vector x_descs; + std::vector y_descs; + TensorDescriptor hx_desc; + TensorDescriptor hy_desc; + TensorDescriptor cx_desc; + TensorDescriptor cy_desc; + + RNNDescriptors(const RNNParams& fn, cudnnHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) { + rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); + x_descs = fn.tensors.descriptors(x); + y_descs = fn.tensors.descriptors(y); + hx_desc.set(hx, 5); + hy_desc.set(hx, 5); + if (cx.defined()) { + cx_desc.set(cx, 5); + cy_desc.set(cx, 5); + } + } + + // TODO: This is annoying, having to put the cudnnTensorDescriptor_t + // in a contiguous array... + std::vector get_descs(const std::vector& descs) { + std::vector r; + r.reserve(descs.size()); + for (auto& desc : descs) { + r.emplace_back(desc.desc()); + } + return r; + } + + std::vector get_x_descs() { + return get_descs(x_descs); + } + + std::vector get_y_descs() { + return get_descs(y_descs); + } + }; + + int64_t get_num_weights(cudnnHandle_t handle, const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, cudnnDataType_t datatype) { + size_t weight_size; + AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); + auto elem_size = dataSize(datatype); + AT_ASSERTM(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); + return weight_size / elem_size; + } + + int64_t _num_linear_layers(cudnnRNNMode_t mode) { + switch(mode) { + case CUDNN_LSTM: + return 8; + case CUDNN_GRU: + return 6; + case CUDNN_RNN_RELU: + return 2; + case CUDNN_RNN_TANH: + return 2; + default: + AT_ERROR("unknown cuDNN RNN mode %d", mode); + } + } + + /* + Returns weight and bias tensors for each layer of the RNN. These tensors + are views on the underlying weight buffer allocated by CuDNN. + + Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively), + these parameters are concatenated along the first dimension. + These parameters are returned in a consistent order by CuDNN: + (reset, forget, cell, output) for LSTM + (reset, input, new) for GRU + Args: + fn: The RNN function object holding the RNN state + handle: a CuDNN handle + weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer + Returns: + parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], with length equal to the num_layers. 
+ This is represented as a pair of vector, and outer-dimension stride + (NB: Can't return MatrixRef because we need to allocate the underlying tensor) + */ + std::pair, size_t> // stride0 + get_parameters( + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, + const Tensor& weight_buf + ) { + auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + std::vector params; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_layers = rnn.num_directions() * rnn.num_layers; + size_t cur_offset = 0; + size_t global_layer_params_count = 0; + for (int64_t layer = 0; layer < num_layers; layer++) { + size_t layer_params_count = 0; + for (auto cudnn_method : cudnn_methods) { + for (int64_t linear_id = 0; linear_id < num_linear_layers; linear_id++) { + FilterDescriptor lin_layer_mat_desc; + void* matrix_pointer; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer + )); + cudnnDataType_t data_type; + cudnnTensorFormat_t format; + int nb_dims; + constexpr int min_dim = 3; + // TODO: The use of CPU tensor here is a bit goofy in C++, + // some sort of alloca would be good enough except that it is + // kind of convenient to be able to prod() on it. + Tensor filter_dim_a = at::CPU(kInt).tensor(min_dim); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a.data() + )); + + AT_ASSERTM(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); + filter_dim_a = filter_dim_a.slice(0, 0, nb_dims); + auto elem_size = dataSize(rnn.datatype); + auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + AT_ASSERTM(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); + size_t offset = offset_bytes / elem_size; + + // for all the RNN types provided by CUDNN, all the ih weights + // are the same size and are allocated in a contiguous chunk + // (same for the hh weights, and the ih and hh biases). + // Since we're storing all the weights in a single tensor anyway, + // might as well merge the CUDNN ones into a single tensor as well + int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); + if (linear_id == 0 || linear_id == num_linear_layers / 2) { + std::initializer_list size = { + mat_numel * num_linear_layers / 2, 1}; + // Generate a new parameter tensor which is a view into the + // weight_buf. 
+ Tensor param = weight_buf.type().tensor().set_(*weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); + layer_params_count++; + } else { + AT_ASSERTM(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); + } + cur_offset = offset + mat_numel; + } + } // for cudnn_method + if (layer == 0) { + global_layer_params_count = layer_params_count; + } else { + AT_ASSERTM(global_layer_params_count == layer_params_count, + "global_layer_params_count = ", global_layer_params_count, + "; layer_params_count = ", layer_params_count); + } + } // for layer + return std::make_pair(params, global_layer_params_count); + } + + void _copyParams(MatrixRef params_from, MatrixRef params_to) { + AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch"); + for (size_t i = 0; i < params_from.size(0); i++) { + auto layer_params_from = params_from[i]; + auto layer_params_to = params_to[i]; + // NOTE: these lists have all weights before all biases, so if the layer + // doesn't use biases, iteration will terminate once layer_params_from ends + // and ignore them. + for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); + a != layer_params_from.end() && b != layer_params_to.end(); + ++a, ++b) { + auto param_from = *a, param_to = *b; + AT_ASSERTM(param_from.type() == param_to.type(), "parameter types mismatch"); + param_to.copy_(param_from.view_as(param_to)); + } + } + } + + std::vector _input_size(const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, tensors.input_size}; + } else { + return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; + } + } + + std::vector _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; + } + + std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, rnn.hidden_size * rnn.num_directions()}; + } else { + return {tensors.seq_length, tensors.mini_batch, rnn.hidden_size * rnn.num_directions()}; + } + } + +} // anonymous namespace + +// NB: does inplace update into TensorList +// It would be a relatively simple matter to refactor this into multiple +// functions, only one of which does an inplace update, but we leave this +// for future work +Tensor _cudnn_rnn_flatten_weight( + TensorList weight_arr, int64_t weight_stride0, + int64_t input_size, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, + bool fn_bidirectional + ) { + + if (weight_arr.size() == 0) { + throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); + } + + auto any_param = weight_arr[0]; + + RNNDescriptorParams rnn; + rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(any_param)); + + auto handle = getCudnnHandle(); + RNNDescriptor rnn_desc = rnn.descriptor(handle); + + TensorGeometry x_geom({1, input_size}); + TensorDescriptor x_desc; + x_desc.set(getCudnnDataType(any_param), x_geom.sizes(), x_geom.strides(), 5); + + auto num_weights = get_num_weights(handle, rnn_desc, x_desc, rnn.datatype); + auto weight_buf = any_param.type().tensor(num_weights).zero_(); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + // Slice off views into weight_buf + std::vector params_arr; + size_t params_stride0; + std::tie(params_arr, params_stride0) = 
get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf); + + MatrixRef weight{weight_arr, static_cast(weight_stride0)}, + params{params_arr, params_stride0}; + + // Copy weights + _copyParams(weight, params); + + // Update the storage + for (size_t i = 0; i < weight.size(0); i++) { + for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin(); + orig_param_it != weight[i].end() && new_param_it != params[i].end(); + orig_param_it++, new_param_it++) { + auto orig_param = *orig_param_it, new_param = *new_param_it; + orig_param.set_(new_param.view_as(orig_param)); + } + } + + return weight_buf; +} + +// NB: when fn_batch_sizes is empty, that means no batch sizes was specified +std::tuple _cudnn_rnn( + const Tensor& input_r, + TensorList weight, int64_t weight_stride0, + const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state + ) { + + auto input = input_r; + auto weight_buf = weight_buf_r; + if (fn_dropout_state.defined()) { + auto input_arg = TensorArg(input, "input", 1); + auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); + checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); + } + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + // TODO: Set device to input + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + // TODO: can batch_first be a wrapper around this function? 
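+  // (Sketch of what such a hypothetical wrapper would look like -- it would
+  // just transpose on the way in and out, which is what the batch_first
+  // branches in this function already do inline:
+  //   auto outs = _cudnn_rnn(input.transpose(0, 1), ..., /*batch_first=*/false, ...);
+  //   std::get<0>(outs).transpose_(0, 1);
+  // )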
+ auto is_input_packed = fn.tensors.batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + } + + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto output_size = _output_size(fn.rnn, fn.tensors); + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + auto output = input.type().tensor(output_size); + auto hy = hx.type().tensor(hidden_size); + Tensor cy; + if (cx.defined()) { + cy = cx.type().tensor(hidden_size); + } else { + cy = hx.type().tensor(); // NB: Not allowed to return undefined tensors + } + auto y = output; + + auto handle = getCudnnHandle(); + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + if (!weight_buf.defined()) { + auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], fn.rnn.datatype); + weight_buf = x.type().tensor(num_weights); + w_desc.set(weight_buf, 3); + weight_buf.zero_(); + std::vector params; + size_t params_stride0; + std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); + _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, + MatrixRef{params, params_stride0}); + } else { + w_desc.set(weight_buf, 3); + } + + if (cx.defined() && !cx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); + throw std::runtime_error(oss.str()); + } + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + Tensor reserve; + // NB: Previously, the test was for fn.requires_grad, but we don't have + // this information. Use 'train' as a proxy. + if (fn_train) { + size_t reserve_size; + AT_CUDNN_CHECK(cudnnGetRNNTrainingReserveSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &reserve_size + )); + reserve = input.type().toScalarType(kByte).tensor(reserve_size); + AT_CUDNN_CHECK(cudnnRNNForwardTraining( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), weight_buf.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + descs.hy_desc.desc(), hy.data_ptr(), + descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0), + reserve.data_ptr(), reserve.size(0) + )); + } else { // inference + reserve = input.type().toScalarType(kByte).tensor(); + AT_CUDNN_CHECK(cudnnRNNForwardInference( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), weight_buf.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + descs.hy_desc.desc(), hy.data_ptr(), + descs.cy_desc.desc(), cy.defined() ? 
cy.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0) + )); + + } + + if (batch_first && !is_input_packed) { + output.transpose_(0, 1); + } + + return std::make_tuple(output, hy, cy, reserve, weight_buf); +} + +std::tuple _cudnn_rnn_backward_input( + const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy, + const Tensor& grad_cy, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state, const Tensor& fn_reserve, + std::array output_mask + ) { + + auto input = input_r; + auto grad_output = grad_output_r; + auto output = output_r; + + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + // TODO: Set device to input + auto handle = getCudnnHandle(); + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + auto is_input_packed = fn_batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + grad_output = grad_output.transpose(0, 1); + output = output.transpose(0, 1); + } + + auto input_size = _input_size(fn.tensors); + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto output_size = _output_size(fn.rnn, fn.tensors); + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + auto dy = grad_output.contiguous(); + auto y = output; + auto w = weight_buf; + auto dx = input.type().tensor(input.sizes()); // TODO: more compact way of saying this + auto dhy = grad_hy.contiguous().view(hidden_size); + auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(hidden_size) : Tensor(); + auto dhx = hx.type().tensor(hidden_size); + AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); + auto dcx = cx.defined() ? 
cx.type().tensor(hidden_size) : Tensor(); + + if (!fn_train) { + throw std::runtime_error("cudnn RNN backward can only be called in training mode"); + } + if (!input.sizes().equals(input_size)) { + std::ostringstream oss; + oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); + throw std::runtime_error(oss.str()); + } + if (!output.sizes().equals(output_size)) { + std::ostringstream oss; + oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); + throw std::runtime_error(oss.str()); + } + if (hx.defined() && !hx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); + throw std::runtime_error(oss.str()); + } + if (cx.defined() && !cx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); + throw std::runtime_error(oss.str()); + } + if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); + throw std::runtime_error(oss.str()); + } + if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); + throw std::runtime_error(oss.str()); + } + if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { + throw std::runtime_error("Gradients aren't CUDA tensors"); + } + + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + // TODO: put this in the correct device??? + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + AT_CUDNN_CHECK(cudnnRNNBackwardData( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + y_descs_arr.data(), y.data_ptr(), + y_descs_arr.data(), dy.data_ptr(), + descs.hy_desc.desc(), dhy.data_ptr(), + descs.cy_desc.desc(), cx.defined() ? dcy.data_ptr() : nullptr, + w_desc.desc(), w.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, + x_descs_arr.data(), dx.data_ptr(), + descs.hx_desc.desc(), dhx.data_ptr(), + descs.cx_desc.desc(), cx.defined() ? dcx.data_ptr() : nullptr, + workspace.data_ptr(), workspace.size(0), + fn_reserve.data_ptr(), fn_reserve.size(0) + )); + + if (batch_first && !is_input_packed) { + dx = dx.transpose_(0, 1); + } + + return std::make_tuple(dx, dhx, dcx); +} + +// NB: This MUST BE CALLED AFTER _cudnn_rnn_backward_input. +// We'll give a user friendly combined function... 
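+// (For illustration, a caller is expected to sequence the two backward pieces
+// the way the combined _cudnn_rnn_backward dispatcher below does -- data
+// gradients first, since that call also mutates the reserve buffer, then
+// weight gradients:
+//   std::tie(dx, dhx, dcx) = _cudnn_rnn_backward_input(...);
+//   dw = _cudnn_rnn_backward_weight(...);
+// )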
+std::vector _cudnn_rnn_backward_weight( + // TODO: I think tensor geometry sufficient for weight_buf/weight + const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0, + const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& output_r, + int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_num_layers, bool batch_first, double fn_dropout, + bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, + const Tensor& fn_dropout_state, const Tensor& fn_reserve + ) { + + MatrixRef weight{ weight_arr, static_cast(weight_stride0) }; + + auto input = input_r; + auto output = output_r; + + RNNParams fn; + fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, getCudnnDataType(input)); + fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); + fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); + + auto handle = getCudnnHandle(); + + if (fn.rnn.mode != CUDNN_LSTM) { + if (cx.defined()) { + throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); + } + } + + auto is_input_packed = fn_batch_sizes.size() != 0; + if (batch_first && !is_input_packed) { + input = input.transpose(0, 1); + output = output.transpose(0, 1); + } + + auto input_size = _input_size(fn.tensors); + auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + + if (!fn_train) { + throw std::runtime_error("cudnn RNN backward can only be called in training mode"); + } + if (!input.sizes().equals(input_size)) { + std::ostringstream oss; + oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); + throw std::runtime_error(oss.str()); + } + if (hx.defined() && !hx.sizes().equals(hidden_size)) { + std::ostringstream oss; + oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); + throw std::runtime_error(oss.str()); + } + // TODO: the above were the only checks in rnn.py, but it doesn't seem + // like these checks are enough + + if (!hx.is_contiguous()) { + throw std::runtime_error("rnn: hx is not contiguous"); + } + if (cx.defined() && !cx.is_contiguous()) { + throw std::runtime_error("rnn: cx is not contiguous"); + } + + auto x = input.contiguous(); + const auto& y = output; + auto dw = weight_buf.type().tensor(weight_buf.sizes()).zero_(); + + RNNDescriptors descs(fn, handle, x, y, hx, cx); + + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + size_t workspace_size; + auto x_descs_arr = descs.get_x_descs(); + auto y_descs_arr = descs.get_y_descs(); + AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size + )); + Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + + AT_CUDNN_CHECK(cudnnRNNBackwardWeights( + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), x.data_ptr(), + descs.hx_desc.desc(), hx.data_ptr(), + y_descs_arr.data(), y.data_ptr(), + workspace.data_ptr(), workspace.size(0), + w_desc.desc(), dw.data_ptr(), + fn_reserve.data_ptr(), fn_reserve.size(0) + )); + + std::vector grad_weight_arr; + grad_weight_arr.reserve( weight.numel() ); + for (const auto& w : weight_arr) { + grad_weight_arr.emplace_back(w.type().tensor(w.sizes()).zero_()); + } + + std::vector grad_params_arr; + size_t grad_params_stride0; + std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + _copyParams(MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{grad_weight_arr, static_cast(weight_stride0)}); + + return 
grad_weight_arr; // stride is known from call site (and also inconvenient to return)
+}
+
+// We need this dispatcher because _cudnn_rnn_backward_weight has a stringent
+// ordering requirement with _cudnn_rnn_backward_input
+std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>> _cudnn_rnn_backward(
+    const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
+    const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r,
+    const Tensor& grad_cy_r,
+    int64_t mode, int64_t hidden_size,
+    int64_t num_layers, bool batch_first, double dropout,
+    bool train, bool bidirectional, IntList batch_sizes,
+    const Tensor& dropout_state, const Tensor& reserve,
+    std::array<bool, 4> output_mask
+    ) {
+
+  auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output);
+  auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx);
+  auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r;
+
+  Tensor dx, dhx, dcx;
+  // NB: unconditionally compute this gradient, because it mutates reserve
+  std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]});
+  std::vector<Tensor> dw;
+  if (output_mask[3]) {
+    dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve);
+  }
+  return std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>>{dx, dhx, dcx, dw};
+}
+
+// TODO: I am not sure if we actually need the 'dropout' and 'train' parameters
+// to initialize just the state tensor
+Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) {
+  auto handle = getCudnnHandle();
+  DropoutDescriptor dropout_desc;
+  auto dropout_p = train ? dropout : 0;
+  dropout_desc.initialize_rng(ty, handle, dropout_p, dropout_seed);
+  return dropout_desc.state;
+}
+
+}} // namespace at::native
+
+#endif // AT_CUDNN_ENABLED()
diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
new file mode 100644
index 0000000..c345182
--- /dev/null
+++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
@@ -0,0 +1,302 @@
+#include "ATen/ATen.h"
+#include "ATen/NativeFunctions.h"
+#include "ATen/native/SpectralOpsUtils.h"
+#include "ATen/Config.h"
+
+#if !AT_MKL_ENABLED()
+
+namespace at { namespace native {
+
+Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim,
+                bool complex_input, bool complex_output,
+                bool inverse, IntList checked_signal_sizes,
+                bool normalized, bool onesided,
+                IntList output_sizes) {
+  throw std::runtime_error("fft: ATen not compiled with MKL support");
+}
+
+}}
+
+#else // AT_MKL_ENABLED
+
+#include "ATen/ATen.h"
+#include "ATen/Config.h"
+#include "ATen/Dispatch.h"
+#include "ATen/Utils.h"
+#include "ATen/NativeFunctions.h"
+
+#include <algorithm>
+#include <vector>
+#include <numeric>
+#include <cmath>
+
+#include <mkl_dfti.h>
+#include <ATen/mkl/Exceptions.h>
+#include <ATen/mkl/Descriptors.h>
+#include <ATen/mkl/Limits.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace at { namespace native {
+
+// In real-to-complex transform, MKL FFT only fills half of the values due to
+// conjugate symmetry. See native/SpectralOpsUtils.h for more details.
+// The following structs are used to fill in the other half with symmetry in
+// case of real-to-complex transform with onesided=False flag.
+// See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h.
+
+template <typename scalar_t>
+static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
+    int64_t signal_ndim, int64_t size_last_dim,
+    int64_t start_last_dim_idx, int64_t i, int64_t num) {
+  scalar_t *data = output.data<scalar_t>();
+
+  // A slice means a slice of last dimension (of size size_last_dim)
+
+  // This function iterates through the slices to fill, i.e. to_slice_data
+  // (basically data_slices[i:i+num]), and keeps track of the slices it reads
+  // data from, i.e., from_slice_data, using from_slice_indices, a vector
+  // containing the index of the from_slice_data slice.
+
+  // Compute the indices for the first from_slice_data
+  std::vector<int64_t> from_slice_indices(signal_ndim);  // up to before last signal dim
+  int64_t remainder = i;
+  // set last signal dim values
+  int64_t from_slice_offset = 0;
+  for (int64_t d = signal_ndim - 1; d >= 0; d--) {
+    int64_t dim_size = output.size(d);
+    int64_t dim_idx = remainder % dim_size;
+    remainder = remainder / dim_size;
+    from_slice_indices[d] = dim_idx;
+    if (d == 0) {
+      from_slice_offset += dim_idx * output.stride(d);
+    } else if (dim_idx != 0) {
+      from_slice_offset += (dim_size - dim_idx) * output.stride(d);
+    }
+  }
+
+  // First to_slice_data and from_slice_data
+  scalar_t *to_slice_data = data + i * size_last_dim * 2;
+  scalar_t *from_slice_data = data + from_slice_offset;
+
+  while (num > 0) {
+    // Fill to_slice_data from values in from_slice_data
+    for (int64_t j = start_last_dim_idx; j < size_last_dim; j++) {
+      // multiply index by 2 because the last (complex) dim has size 2
+      int64_t to_idx = j * 2;
+      int64_t from_idx = (size_last_dim - j) * 2;
+      to_slice_data[to_idx] = from_slice_data[from_idx];
+      to_slice_data[to_idx + 1] = -from_slice_data[from_idx + 1];
+    }
+    // Compute the next to_slice_data and from_slice_data slices
+    to_slice_data += size_last_dim * 2;
+    for (int64_t d = signal_ndim - 1; d >= 0; d--) {
+      // Compute the next index at this dimension using conjugate symmetry
+      // Break out of this loop if nothing carries over
+      from_slice_indices[d] = (from_slice_indices[d] + 1) % output.size(d);
+      if (d > 0) {
+        // At d > 0 nonbatch dim, to get next from_slice_data offset
+        // 1. if this dim idx becomes 1, will need to add (size - 1) * stride
+        // 2. otherwise, will need to subtract stride
+        if (from_slice_indices[d] == 0) {
+          // Subtract. Carries over to previous dimension
+          from_slice_data -= output.stride(d);
+        } else if (from_slice_indices[d] == 1) {
+          // Dimension index becomes 1
+          // Doesn't carry over to previous dimension
+          from_slice_data += (output.size(d) - 1) * output.stride(d);
+          break;
+        } else {
+          // Subtract. Doesn't carry over to previous dimension
+          from_slice_data -= output.stride(d);
+          break;
+        }
+      } else {
+        // At d = 0 nonbatch dim, it means that to_slice_data is now at the
+        // beginning of a data sample. It maps to itself by conjugate symmetry.
+        from_slice_data = to_slice_data;
+      }
+    }
+    num--;
+  }
+}
+
+// input should be a contiguous batched tensor of the same size as full (twosided)
+// signals, but only contains half (onesided) of the values.
+// This function modifies it in place.
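+// For a single 1-d signal of length n (ignoring the batch dimensions), the
+// entries filled in satisfy the usual Hermitian relation
+//   output[j] == conj(output[n - j])   for j in [last_dim_start_slice, n),
+// which the slice helper above implements by copying the real part and
+// negating the imaginary part.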
+static inline void _fft_fill_with_conjugate_symmetry_(Tensor& input, + int64_t signal_ndim, int64_t size_last_dim, + int64_t last_dim_start_slice) { + if (last_dim_start_slice >= size_last_dim) { + return; + } + + int64_t num = 1; + for (int64_t d = 0; d < signal_ndim; d++) { + num *= input.size(d); + } +#ifdef _OPENMP + if (num > 500) { + int nthreads = omp_get_num_threads(); + int64_t num_slices_per_thread = num / nthreads + 1; + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int64_t start = tid * num_slices_per_thread; + AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] { + _fft_fill_with_conjugate_symmetry_slice(input, signal_ndim, size_last_dim, + last_dim_start_slice, start, std::min(num_slices_per_thread, num - start)); + }); + } + return; + } +#endif + AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] { + _fft_fill_with_conjugate_symmetry_slice(input, signal_ndim, size_last_dim, + last_dim_start_slice, 0, num); + }); +} + +// MKL DFTI +Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, + bool complex_input, bool complex_output, + bool inverse, IntList checked_signal_sizes, + bool normalized, bool onesided, + IntList output_sizes) { + int64_t batch = self.size(0); + Tensor input = self; + // real/imag dimension must aligned when viewed as of complex type + if (complex_input) { + bool need_contiguous = input.stride(-1) != 1; + for (int64_t i = 0; !need_contiguous && i <= signal_ndim; i++) { + need_contiguous |= input.stride(i) % 2 != 0; + } + if (need_contiguous) { + input = input.contiguous(); + } + } + + // check if we can use MKL because MKL_LONG is 32bit on some OS, e.g. Windows + // need to check input and output size and strides + // be careful about complex domain, where the stride needs to be divided by 2 + // only need to test upper bound MKL_LONG_MAX as these values are non-negative + if (sizeof(MKL_LONG) < sizeof(int64_t)) { + bool need_contiguous = false; + int64_t inumel = 1 /* istride if we contiguous-fy */, onumel = 1; + int64_t isize, osize, istride, ostride; + for (int64_t i = signal_ndim; i >= 0; i--) { + isize = input.size(i); + osize = output_sizes[i]; + istride = complex_input ? input.stride(i) >> 1 : input.stride(i); + ostride = onumel; + if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { + std::ostringstream ss; + ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " + << MKL_LONG_MAX << "]"; + throw std::runtime_error(ss.str()); + } + if (!need_contiguous && istride > MKL_LONG_MAX) { + // If we didn't plan to contiguous-fy but the `istride` exceeds bound, + // check if we can stride (equal to `inumel`) get back within bound if + // we contiguous-fy. If so, then we need to always check `inumel` + // instead for the remaining iterations. The iterations before this are + // fine as `inumel` is non-decreasing. 
+ need_contiguous = true; + } + if (need_contiguous && inumel > MKL_LONG_MAX) { + std::ostringstream ss; + ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " + << MKL_LONG_MAX << "]"; + throw std::runtime_error(ss.str()); + } + inumel *= isize; + onumel *= osize; + } + } + Tensor output = input.type().tensor(output_sizes); + + // precision + DFTI_CONFIG_VALUE prec; + if (input.type().scalarType() == ScalarType::Float) { + prec = DFTI_SINGLE; + } else if (input.type().scalarType() == ScalarType::Double) { + prec = DFTI_DOUBLE; + } else { + std::ostringstream ss; + ss << "MKL FFT doesn't support tensor of type: " + << at::toString(input.type().scalarType()); + throw std::runtime_error(ss.str()); + } + // signal type + DFTI_CONFIG_VALUE signal_type; + if (!inverse) { + signal_type = complex_input ? DFTI_COMPLEX : DFTI_REAL; + } else { + signal_type = complex_output ? DFTI_COMPLEX : DFTI_REAL; + } + // create descriptor with signal size + std::vector mkl_signal_sizes(checked_signal_sizes.begin(), checked_signal_sizes.end()); + DftiDescriptor descriptor; + descriptor.init(prec, signal_type, signal_ndim, mkl_signal_sizes.data()); + // out of place FFT + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + // batch mode + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch)); + + auto istrides = input.strides(); + auto ostrides = output.strides(); + // batch dim stride, i.e., dist between each data + MKL_LONG idist = complex_input ? istrides[0] >> 1 : istrides[0]; + MKL_LONG odist = complex_output ? ostrides[0] >> 1 : ostrides[0]; + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + // signal strides + // first val is offset, set to zero (ignored) + std::vector mkl_istrides(1 + signal_ndim, 0), mkl_ostrides(1 + signal_ndim, 0); + for (int64_t i = 1; i <= signal_ndim; i++) { + mkl_istrides[i] = complex_input ? istrides[i] >> 1 : istrides[i]; + mkl_ostrides[i] = complex_output ? ostrides[i] >> 1 : ostrides[i]; + } + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_istrides.data())); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_ostrides.data())); + // if conjugate domain of real is involved, set standard CCE storage type + // this will become default in MKL in future + if (!complex_input || !complex_output) { + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); + } + // rescale if needed by normalized flag or inverse transform + if (normalized || inverse) { + auto signal_numel = at::prod_intlist(checked_signal_sizes); + double double_scale; + if (normalized) { + double_scale = 1.0 / std::sqrt(static_cast(signal_numel)); + } else { + double_scale = 1.0 / static_cast(signal_numel); + } + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), + inverse ? DFTI_BACKWARD_SCALE : DFTI_FORWARD_SCALE, + prec == DFTI_DOUBLE ? 
double_scale : static_cast<float>(double_scale)));
+  }
+  // finalize
+  MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get()));
+  // run
+  if (!inverse) {
+    MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), output.data_ptr()));
+  } else {
+    MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), output.data_ptr()));
+  }
+  // now if needed, fill out the other half using Hermitian symmetry dim
+  if (!complex_input && complex_output && !onesided) {
+    auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1];
+    auto start_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim);
+    _fft_fill_with_conjugate_symmetry_(output, signal_ndim, size_last_signal_dim, start_slice);
+  }
+  return output;
+}
+
+}} // namespace at::native
+
+#endif
+
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
new file mode 100644
index 0000000..00f4e8f
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -0,0 +1,447 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/Config.h>
+
+#if !AT_MKLDNN_ENABLED()
+
+namespace at { namespace native {
+
+at::Tensor mkldnn_convolution(
+    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+    IntList padding, IntList stride, IntList dilation, int64_t groups) {
+  throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
+}
+
+at::Tensor mkldnn_convolution_backward_input(
+    IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
+  throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
+    IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
+  throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> mkldnn_convolution_backward(
+    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool, 3> output_mask) {
+  throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
+}
+
+}}
+
+#else // AT_MKLDNN_ENABLED
+
+#include <ATen/mkldnn/Runtime.h>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+constexpr int input_batch_size_dim = 0;  // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0;  // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+std::vector<int64_t> conv_output_size(
+    IntList input_size, IntList weight_size,
+    IntList padding, IntList stride, IntList dilation, int64_t groups)
+{
+  auto dim = input_size.size();
+  std::vector<int64_t> output_size(dim);
+  output_size[0] = input_size[input_batch_size_dim];
+  output_size[1] = weight_size[weight_output_channels_dim];
+  for (size_t d = 2; d < dim; ++d) {
+    auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
+    output_size[d] = (input_size[d] + (2 * padding[d - 2])
+                      - kernel) / stride[d - 2] + 1;
+  }
+  return output_size;
+}
+
+at::Tensor mkldnn_convolution(
+    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+    IntList padding, IntList stride, 
IntList dilation, int64_t groups) +{ + auto output = input.type().tensor(conv_output_size( + input.sizes(), weight.sizes(), padding, stride, dilation, groups)); + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = input.size(0); + int32_t ic = input.size(1); + int32_t ih = input.size(2); + int32_t iw = input.size(3); + + int32_t oc = output.size(1); + int32_t oh = output.size(2); + int32_t ow = output.size(3); + + int32_t kh = weight.size(2); + int32_t kw = weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + auto format_x = memory::format::x; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + auto input_md = memory::desc({input_tz}, data_t, format_any); + auto weight_md = memory::desc({weight_tz}, data_t, format_any); + auto bias_md = memory::desc({bias_tz}, data_t, format_any); + auto output_md = memory::desc({output_tz}, data_t, format_any); + + std::shared_ptr conv_forward_desc; + if (bias.defined()) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + input.data_ptr()); + auto weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + weight.data_ptr()); + auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + output.data_ptr()); + + std::vector net; + + auto input_pd = conv_forward_pd->src_primitive_desc(); + auto input_memory = input_usr_memory; + if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) { + input_memory = memory(input_pd); + net.push_back(reorder(input_usr_memory, input_memory)); + } + + auto weight_pd = conv_forward_pd->weights_primitive_desc(); + auto weight_memory = weight_usr_memory; + if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) { + weight_memory = memory(weight_pd); + net.push_back(reorder(weight_usr_memory, weight_memory)); + } + + auto output_pd = conv_forward_pd->dst_primitive_desc(); + auto output_memory = output_usr_memory; + if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) { + output_memory = memory(output_pd); + } + + std::shared_ptr conv_forward; + std::shared_ptr bias_usr_memory; + if (bias.defined()) { + bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine}, + bias.data_ptr())); + conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory, + weight_memory, *bias_usr_memory, output_memory)); + } else { + conv_forward.reset(new 
convolution_forward(*conv_forward_pd, input_memory, + weight_memory, output_memory)); + } + net.push_back(*conv_forward); + + if (output_memory != output_usr_memory) { + net.push_back(reorder(output_memory, output_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return output; +} + +Tensor mkldnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) +{ + auto grad_input = grad_output.type().tensor(input_size); + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = grad_input.size(0); + int32_t ic = grad_input.size(1); + int32_t ih = grad_input.size(2); + int32_t iw = grad_input.size(3); + + int32_t oc = grad_output.size(1); + int32_t oh = grad_output.size(2); + int32_t ow = grad_output.size(3); + + int32_t kh = weight.size(2); + int32_t kw = weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + auto input_md = memory::desc({input_tz}, data_t, format_any); + auto weight_md = memory::desc({weight_tz}, data_t, format_any); + auto bias_md = memory::desc({bias_tz}, data_t, format_any); + auto output_md = memory::desc({output_tz}, data_t, format_any); + + // need to re-create conv_forward_pd to feed conv_backward_data_pd + std::shared_ptr conv_forward_desc; + if (bias_defined) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + std::shared_ptr conv_backward_data_desc; + conv_backward_data_desc.reset(new convolution_backward_data::desc( + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + + std::shared_ptr conv_backward_data_pd; + conv_backward_data_pd.reset(new convolution_backward_data::primitive_desc( + *conv_backward_data_desc, cpu_engine, *conv_forward_pd)); + + auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + grad_output.data_ptr()); + auto weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + weight.data_ptr()); + auto grad_input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + grad_input.data_ptr()); + + std::vector net; + + auto grad_output_pd = conv_backward_data_pd->diff_dst_primitive_desc(); + auto grad_output_memory = grad_output_usr_memory; + if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) { + grad_output_memory = memory(grad_output_pd); + 
net.push_back(reorder(grad_output_usr_memory, grad_output_memory)); + } + + auto weight_pd = conv_backward_data_pd->weights_primitive_desc(); + auto weight_memory = weight_usr_memory; + if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) { + weight_memory = memory(weight_pd); + net.push_back(reorder(weight_usr_memory, weight_memory)); + } + + auto grad_input_pd = conv_backward_data_pd->diff_src_primitive_desc(); + auto grad_input_memory = grad_input_usr_memory; + if (grad_input_memory.get_primitive_desc() != memory::primitive_desc(grad_input_pd)) { + grad_input_memory = memory(grad_input_pd); + } + + std::shared_ptr conv_backward_data; + conv_backward_data.reset(new convolution_backward_data(*conv_backward_data_pd, + grad_output_memory, weight_memory, grad_input_memory)); + net.push_back(*conv_backward_data); + + if (grad_input_memory != grad_input_usr_memory) { + net.push_back(reorder(grad_input_memory, grad_input_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return grad_input; +} + +std::tuple mkldnn_convolution_backward_weights( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) +{ + auto grad_weight = grad_output.type().tensor(weight_size); + + Tensor grad_bias; + if (bias_defined) { + grad_bias = grad_output.type().tensor({grad_output.size(1)}); + } + + auto cpu_engine = CpuEngine::Instance().get_engine(); + + int32_t g = groups; + + int32_t n = input.size(0); + int32_t ic = input.size(1); + int32_t ih = input.size(2); + int32_t iw = input.size(3); + + int32_t oc = grad_output.size(1); + int32_t oh = grad_output.size(2); + int32_t ow = grad_output.size(3); + + int32_t kh = grad_weight.size(2); + int32_t kw = grad_weight.size(3); + + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + auto data_t = memory::data_type::f32; + auto format_any = memory::format::any; + auto format_nchw = memory::format::nchw; + auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw; + auto format_x = memory::format::x; + + memory::dims input_tz = {n, ic, ih, iw}; + memory::dims weight_tz = (g!= 1) ? 
memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw}; + memory::dims bias_tz = {oc}; + memory::dims output_tz = {n, oc, oh, ow}; + memory::dims _stride = {sh, sw}; + memory::dims _padding = {ph, pw}; + + memory::desc input_md({input_tz}, data_t, format_any); + memory::desc weight_md({weight_tz}, data_t, format_any); + memory::desc bias_md({bias_tz}, data_t, format_any); + memory::desc output_md({output_tz}, data_t, format_any); + + // need to re-create conv_forward_pd to feed conv_backward_weight_pd + std::shared_ptr conv_forward_desc; + if (bias_defined) { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_forward_pd; + conv_forward_pd.reset(new convolution_forward::primitive_desc( + *conv_forward_desc, cpu_engine)); + + std::shared_ptr conv_backward_weight_desc; + if (bias_defined) { + conv_backward_weight_desc.reset(new convolution_backward_weights::desc( + convolution_direct, input_md, weight_md, bias_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } else { + conv_backward_weight_desc.reset(new convolution_backward_weights::desc( + convolution_direct, input_md, weight_md, output_md, + _stride, _padding, _padding, padding_kind::zero)); + } + + std::shared_ptr conv_backward_weight_pd; + conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc( + *conv_backward_weight_desc, cpu_engine, *conv_forward_pd)); + + auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine}, + input.data_ptr()); + auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine}, + grad_output.data_ptr()); + auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine}, + grad_weight.data_ptr()); + std::shared_ptr grad_bias_memory; + + std::vector net; + + auto input_pd = conv_backward_weight_pd->src_primitive_desc(); + auto input_memory = input_usr_memory; + if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) { + input_memory = memory(input_pd); + net.push_back(reorder(input_usr_memory, input_memory)); + } + + auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc(); + auto grad_output_memory = grad_output_usr_memory; + if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) { + grad_output_memory = memory(grad_output_pd); + net.push_back(reorder(grad_output_usr_memory, grad_output_memory)); + } + + auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc(); + auto grad_weight_memory = grad_weight_usr_memory; + if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) { + grad_weight_memory = memory(grad_weight_pd); + } + + std::shared_ptr conv_backward_weight; + if (bias_defined) { + grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine}, + grad_bias.data_ptr())); + conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd, + input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory)); + } else { + conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd, + input_memory, grad_output_memory, grad_weight_memory)); + 
} + + net.push_back(*conv_backward_weight); + + if (grad_weight_memory != grad_weight_usr_memory) { + net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory)); + } + + Stream::Instance().get_stream().submit(net); + + return std::tuple{grad_weight, grad_bias}; +} + +std::tuple mkldnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) +{ + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::mkldnn_convolution_backward_input( + input.sizes(), grad_output, weight, padding, stride, dilation, groups, output_mask[2]); + } + if (output_mask[1] || output_mask[2]) { + std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights( + weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2]); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml new file mode 100644 index 0000000..edadf9b --- /dev/null +++ b/aten/src/ATen/native/native_functions.yaml @@ -0,0 +1,1926 @@ +# See README.md in this directory for more guidance + + +# Temporary type cast operators. These are needed to trace type-casts now since +# Type's are not supported in the IR. Instead, we call down to these +# specialized operators for each datatype. +# TODO: remove when we have Type support in the IR +- func: _cast_Byte(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Char(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Double(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Float(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Int(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Long(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Short(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor + variants: function, method + +- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor + variants: function + dispatch: + CUDA: _cudnn_rnn_flatten_weight + +- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_rnn + +- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? 
dropout_state, Tensor reserve, std::array output_mask) -> (Tensor, Tensor, Tensor, TensorList) + variants: function + dispatch: + CUDA: _cudnn_rnn_backward + +- func: _cudnn_init_dropout_state(Type self_ty, double dropout, bool train, int64_t dropout_seed) -> Tensor + variants: function + dispatch: + CUDA: _cudnn_init_dropout_state + +- func: abs(Tensor self) -> Tensor + +- func: abs_(Tensor self) -> Tensor + dispatch: + CPU: _abs__cpu + CUDA: _abs__cuda + +- func: abs_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _abs_out_cpu + CUDA: _abs_out_cuda + +- func: acos(Tensor self) -> Tensor + +- func: acos_(Tensor self) -> Tensor + dispatch: + CPU: _acos__cpu + CUDA: _acos__cuda + +- func: acos_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _acos_out_cpu + CUDA: _acos_out_cuda + +- func: avg_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, bool ceil_mode=false, bool count_include_pad=true) -> Tensor + variants: function + +- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor + variants: function + +- func: adaptive_max_pool1d(Tensor self, IntList[1] output_size) -> (Tensor, Tensor) + variants: function + +- func: allclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> bool + device_guard: false + +- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addmv_(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addmv_out(Tensor result, Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: addr_(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + +- func: addr_out(Tensor result, Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: all(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: all_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function + +- func: any(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: any_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function + +- func: arange(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: arange(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor + variants: function + +- func: arange(Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: arange_out(Tensor result, Scalar end) -> Tensor + variants: function + +- func: arange(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor + variants: function + deprecated: true + +- func: arange(Type dtype, Scalar end) -> Tensor + variants: function + deprecated: true + +# This function is a temporary hack to allow tracing of arange like constructs with dynamic +# bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; +# if the range you need is based on another tensor, calling this function directly will +# preserve tracing. 
Get rid of this when arange can directly take tensors for bounds +# (so that it can be traced directly). +- func: _dim_arange(Tensor like, int64_t dim) -> Tensor + variants: function + +# `argmin` and `argmax` are exposed in C++ but not in Python, where we only +# expose `_argmin` and `_argmax` (which call the first versions). In Python, we +# then define our own `argmax` and `argmin` that handle passing `dim=None`, +# which gets the argmax/argmin of the flattened array. + +- func: argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor +- func: argmax(Tensor self) -> Tensor +- func: _argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor +- func: argmin(Tensor self) -> Tensor +- func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +# The actual implementations live in Declarations.cwrap. These are just to +# provide default values for storage_offset=self.storage_offset() +- func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor +- func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor + +- func: asin(Tensor self) -> Tensor + +- func: asin_(Tensor self) -> Tensor + dispatch: + CPU: _asin__cpu + CUDA: _asin__cuda + +- func: asin_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _asin_out_cpu + CUDA: _asin_out_cuda + +- func: atan(Tensor self) -> Tensor + +- func: atan_(Tensor self) -> Tensor + dispatch: + CPU: _atan__cpu + CUDA: _atan__cuda + +- func: atan_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _atan_out_cpu + CUDA: _atan_out_cuda + +- func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + +- func: bernoulli(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli(Tensor self, double p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli(Tensor self) -> Tensor + +- func: bernoulli_(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli_(Tensor self, double p, Generator* generator=nullptr) -> Tensor + +- func: bernoulli_(Tensor self) -> Tensor + +- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor + variants: function + +- func: bincount(Tensor self, Tensor? 
weights={}, int64_t minlength=0) -> Tensor + dispatch: + CPU: _bincount_cpu + CUDA: _bincount_cuda + +- func: blackman_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: cat(TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: ceil(Tensor self) -> Tensor + +- func: ceil_(Tensor self) -> Tensor + dispatch: + CPU: _ceil__cpu + CUDA: _ceil__cuda + +- func: ceil_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _ceil_out_cpu + CUDA: _ceil_out_cuda + +- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList + +- func: cudnn_is_acceptable(Tensor self) -> bool + variants: function + device_guard: false + +- func: convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) -> Tensor + variants: function + +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + variants: function + +- func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) -> Tensor + variants: function + +# NB: We MUST call the input self, otherwise codegen will attempt to +# dispatch on ggI... which might be undefined. +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + +- func: conv1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor + variants: function + +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor + +- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) + +# NB: we inherit the goofy argument order from PyTorch torch.nn.functional +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] output_padding=0, int64_t groups=1, IntList[1] dilation=1) -> Tensor + variants: function + +- func: conv_transpose2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, int64_t groups=1, IntList[2] dilation=1) -> Tensor + variants: function + +- func: conv_transpose3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, int64_t groups=1, IntList[3] dilation=1) -> Tensor + variants: function + +- func: cos(Tensor self) -> Tensor + +- func: cos_(Tensor self) -> Tensor + dispatch: + CPU: _cos__cpu + CUDA: _cos__cuda + +- func: cos_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _cos_out_cpu + CUDA: _cos_out_cuda + +- func: cosh(Tensor self) -> Tensor + +- func: cosh_(Tensor self) -> Tensor + dispatch: + CPU: _cosh__cpu + CUDA: _cosh__cuda + +- func: cosh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _cosh_out_cpu + CUDA: _cosh_out_cuda + +- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: cudnn_affine_grid_generator(Tensor theta, int64_t N, int64_t C, int64_t H, int64_t W) -> Tensor + return: + - type: Tensor + name: grid + variants: function + dispatch: + CUDA: cudnn_affine_grid_generator_forward + +# TODO: Why do I have to call this grad?! +- func: cudnn_affine_grid_generator_backward(Tensor grad, int64_t N, int64_t C, int64_t H, int64_t W) + return: + - type: Tensor + name: grad_theta + variants: function + dispatch: + CUDA: cudnn_affine_grid_generator_backward + +- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_batch_norm + +# NB: You can only use this if you used cudnn_batch_norm training=True +- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, double epsilon) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_batch_norm_backward + +- func: cudnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution + +- func: cudnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_input + +- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_convolution_backward + +- func: cudnn_convolution_backward_bias(Tensor grad_output) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_bias + +- func: cudnn_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_weight + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose + +# NB: output_padding not strictly needed here, but it's helpful for the double +# backwards +- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward + +- func: cudnn_convolution_transpose_backward_bias(Tensor grad_output) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_backward_bias + +- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward_input + +- func: cudnn_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor + variants: function + dispatch: + CUDA: cudnn_convolution_transpose_backward_weight + +# NB: input is special cased in a way I don't quite understand +- func: cudnn_grid_sampler(Tensor self, Tensor grid) + return: + - type: Tensor + name: output + variants: function + dispatch: + CUDA: cudnn_grid_sampler_forward + +- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) + return: + - type: Tensor + name: grad_self + - type: Tensor + name: grad_grid + variants: function + dispatch: + CUDA: cudnn_grid_sampler_backward + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
+- func: cumsum(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: cumsum(Tensor self, int64_t dim) -> Tensor + +- func: cumsum_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: cumsum_out(Tensor result, Tensor self, int64_t dim) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: cumprod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: cumprod(Tensor self, int64_t dim) -> Tensor + +- func: cumprod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor + variants: function + +- func: det(Tensor self) -> Tensor + +- func: diagflat(Tensor self, int64_t offset=0) -> Tensor + variants: function + +- func: diagonal(Tensor self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) -> Tensor + +- func: dot(Tensor self, Tensor tensor) -> Tensor + +- func: dot_out(Tensor result, Tensor self, Tensor tensor) -> Tensor + variants: function + +- func: einsum(std::string equation, TensorList tensors) -> Tensor + variants: function + +- func: embedding(Tensor weight, IndexTensor indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) -> Tensor + variants: function + +- func: embedding_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor + variants: function + +- func: embedding_dense_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor + variants: function + dispatch: + CPU: embedding_dense_backward_cpu + CUDA: embedding_dense_backward_cuda + +- func: embedding_renorm_(Tensor self, IndexTensor indices, double max_norm, double norm_type) -> Tensor + variants: function + dispatch: + CPU: embedding_renorm_cpu_ + CUDA: embedding_renorm_cuda_ + +- func: embedding_sparse_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor + variants: function + +# NOTE [ embedding_bag Native Functions ] +# The `_embedding_bag.*` variants assume that input tensors except for `weight`, +# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous. +# We really only need to enforce this for `_embedding_bag` (the forward) because +# the backward inputs are the same as forward ones. +# The above `embedding_bag` wrapper is created to achieve this, e.g., +# applying indices = indices.contiguous(). +# The backward functions apply a check that these input tensors are contiguous. 
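As a reading aid for the note above, here is a minimal sketch of how a caller-facing wrapper can satisfy that contiguity contract before handing the index tensors to the `_embedding_bag` kernel declared just below. This is illustrative only, not the wrapper shipped in this diff; the helper name `embedding_bag_contiguous_sketch` is invented for the example.

```
// Sketch only: normalize index tensors to contiguous memory before calling
// the _embedding_bag kernel, per the contiguity note above.
#include <ATen/ATen.h>
#include <tuple>

std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
embedding_bag_contiguous_sketch(
    const at::Tensor& weight,
    const at::Tensor& indices,
    const at::Tensor& offsets,
    bool scale_grad_by_freq = false,
    int64_t mode = 0,
    bool sparse = false) {
  // _embedding_bag assumes `indices` and `offsets` are contiguous;
  // .contiguous() copies only when a tensor is not already contiguous.
  return at::_embedding_bag(weight,
                            indices.contiguous(),
                            offsets.contiguous(),
                            scale_grad_by_freq,
                            mode,
                            sparse);
}
```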
+ +- func: embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) + variants: function + +- func: _embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: _embedding_bag_cpu + CUDA: _embedding_bag_cuda + +- func: _embedding_bag_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) -> Tensor + variants: function + +- func: _embedding_bag_sparse_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor + variants: function + +- func: _embedding_bag_dense_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor + variants: function + dispatch: + CPU: _embedding_bag_dense_backward_cpu + CUDA: _embedding_bag_dense_backward_cuda + +- func: empty(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: empty_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: empty_like(Tensor self) -> Tensor + variants: function + +- func: empty_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: empty(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: erf(Tensor self) -> Tensor + +- func: erf_(Tensor self) -> Tensor + dispatch: + CPU: _erf__cpu + CUDA: _erf__cuda + +- func: erf_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _erf_out_cpu + CUDA: _erf_out_cuda + +- func: erfc(Tensor self) -> Tensor + +- func: erfc_(Tensor self) -> Tensor + dispatch: + CPU: _erfc__cpu + CUDA: _erfc__cuda + +- func: erfc_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _erfc_out_cpu + CUDA: _erfc_out_cuda + +- func: exp(Tensor self) -> Tensor + +- func: exp_(Tensor self) -> Tensor + dispatch: + CPU: _exp__cpu + CUDA: _exp__cuda + +- func: exp_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _exp_out_cpu + CUDA: _exp_out_cuda + +- func: expm1(Tensor self) -> Tensor + +- func: expm1_(Tensor self) -> Tensor + dispatch: + CPU: _expm1__cpu + CUDA: _expm1__cuda + +- func: expm1_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _expm1_out_cpu + CUDA: _expm1_out_cuda + +- func: expand(Tensor self, IntList size, *, bool implicit=false) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + +- func: expand_as(Tensor self, Tensor other) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
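The two `expand*` entries above are method-only; a short usage sketch shows what that looks like from C++ with the factory functions declared elsewhere in this file. The function name `expand_example` and the shapes are made up for illustration.

```
// Usage sketch for the method-only expand / expand_as declarations above.
#include <ATen/ATen.h>

void expand_example() {
  at::Tensor row = at::ones({1, 4});   // 1 x 4
  at::Tensor mat = at::zeros({3, 4});  // 3 x 4
  // expand broadcasts to an explicit size and returns a view (no copy).
  at::Tensor a = row.expand({3, 4});
  // expand_as is shorthand for expanding to another tensor's sizes.
  at::Tensor b = row.expand_as(mat);
}
```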
+ +- func: eye(int64_t n, TensorOptions options={}) -> Tensor + variants: function + +- func: eye(int64_t n, int64_t m, TensorOptions options={}) -> Tensor + variants: function + +- func: eye_out(Tensor result, int64_t n) -> Tensor + variants: function + dispatch: + CPU: eye_out_cpu + CUDA: eye_out_cuda + +- func: eye_out(Tensor result, int64_t n, int64_t m) -> Tensor + variants: function + dispatch: + CPU: eye_out_cpu + CUDA: eye_out_cuda + +- func: eye(Type dtype, int64_t n, int64_t m=-1) -> Tensor + variants: function + deprecated: true + +- func: flatten(Tensor self, int64_t start_dim=0, int64_t end_dim=-1) -> Tensor + +- func: fill_(Tensor self, Scalar value) -> Tensor + +- func: fill_(Tensor self, Tensor value) -> Tensor + +- func: floor(Tensor self) -> Tensor + +- func: floor_(Tensor self) -> Tensor + dispatch: + CPU: _floor__cpu + CUDA: _floor__cuda + +- func: floor_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _floor_out_cpu + CUDA: _floor_out_cuda + +- func: full(IntList size, Scalar fill_value, TensorOptions options={}) -> Tensor + variants: function + +- func: full_out(Tensor result, IntList size, Scalar fill_value) -> Tensor + variants: function + +- func: full_like(Tensor self, Scalar fill_value) -> Tensor + variants: function + +- func: full_like(Tensor self, Scalar fill_value, *, TensorOptions options) -> Tensor + variants: function + +- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor + variants: function + deprecated: true + +- func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor + variants: function + +- func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: hann_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, double alpha, TensorOptions options={}) -> Tensor + variants: function + +- func: hamming_window(int64_t window_length, bool periodic, double alpha, double beta, TensorOptions options={}) -> Tensor + variants: function + +- func: hinge_embedding_loss(Tensor self, Tensor target, double margin=1.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: ger(Tensor self, Tensor vec2) -> Tensor + +- func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor + variants: function + +- func: gesv(Tensor self, Tensor A) -> (Tensor, Tensor) + +- func: gesv_out(Tensor solution, Tensor lu, Tensor self, Tensor A) -> (Tensor, Tensor) + variants: function + +# gesv handles broadcasting of arbitrary batch dims while _gesv_helper does not. +- func: _gesv_helper(Tensor self, Tensor A) -> (Tensor, Tensor) + dispatch: + CPU: _gesv_helper_cpu + CUDA: _gesv_helper_cuda + +- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor + variants: function + +# FFT + +- func: fft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + +- func: ifft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + +- func: rfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true) -> Tensor + +- func: irfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) -> Tensor + +- func: _fft_with_size(Tensor self, int64_t signal_ndim, bool complex_input, bool complex_output, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) -> Tensor + dispatch: + CPU: _fft_mkl + CUDA: _fft_cufft + +- func: _cufft_get_plan_cache_size() -> int64_t + variants: function + device_guard: false + +- func: _cufft_get_plan_cache_max_size() -> int64_t + variants: function + device_guard: false + +- func: _cufft_set_plan_cache_max_size(int64_t max_size) + variants: function + device_guard: false + +- func: _cufft_clear_plan_cache() + variants: function + device_guard: false + +- func: index(Tensor self, TensorList indices) -> Tensor + # NB: This function is special-cased in tools/autograd/gen_variable_type.py + +- func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor + variants: method + +- func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor + +- func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor + +- func: inverse(Tensor self) -> Tensor + +- func: inverse_out(Tensor result, Tensor self) -> Tensor + variants: function + +- func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor + +- func: is_cuda(Tensor self) -> bool + device_guard: false + +- func: is_distributed(Tensor self) -> bool + device_guard: false + +- func: is_floating_point(Tensor self) -> bool + device_guard: false + +- func: is_nonzero(Tensor self) -> bool + device_guard: false + +- func: is_same_size(Tensor self, Tensor other) -> bool + device_guard: false + +- func: is_signed(Tensor self) -> bool + device_guard: false + +- func: is_sparse(Tensor self) -> bool + device_guard: false + +- func: kthvalue(Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + +- func: kthvalue_out(Tensor values, Tensor indices, Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: layer_norm(Tensor input, IntList normalized_shape, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enable=True) -> Tensor + variants: function + +- func: linspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: linspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor + variants: function + +- func: linspace_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor + variants: function + +- func: linspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor + variants: function + deprecated: true + +- func: log(Tensor self) -> Tensor + +- func: log_(Tensor self) -> Tensor + dispatch: + CPU: _log__cpu + CUDA: _log__cuda + +- func: log_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log_out_cpu + CUDA: _log_out_cuda + +- func: log10(Tensor self) -> Tensor + +- func: log10_(Tensor self) -> Tensor + dispatch: + CPU: _log10__cpu + CUDA: _log10__cuda + +- func: log10_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log10_out_cpu + CUDA: _log10_out_cuda + +- func: log1p(Tensor self) -> Tensor + +- func: log1p_(Tensor self) -> Tensor + dispatch: + CPU: _log1p__cpu + CUDA: _log1p__cuda + SparseCPU: log1p_sparse_ + SparseCUDA: log1p_sparse_ + +- func: log1p_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log1p_out_cpu + CUDA: _log1p_out_cuda + SparseCPU: log1p_out_sparse + SparseCUDA: log1p_out_sparse + +- func: log2(Tensor self) -> Tensor + +- func: log2_(Tensor self) -> Tensor + dispatch: + CPU: _log2__cpu + CUDA: _log2__cuda + +- func: log2_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _log2_out_cpu + CUDA: _log2_out_cuda + +- func: logdet(Tensor self) -> Tensor + +- func: logspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: logspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor + variants: function + +- func: logspace_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: logspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor + variants: function + +- func: logspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor + variants: function + deprecated: true + +- func: log_softmax(Tensor self, int64_t dim) -> Tensor + dispatch: + CPU: log_softmax_cpu + CUDA: log_softmax_cuda + +- func: log_softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor + dispatch: + CPU: log_softmax_backward_cpu + CUDA: log_softmax_backward_cuda + +- func: logsumexp(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: matmul(Tensor self, Tensor other) -> Tensor + +- func: matmul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + +- func: max(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: max_out(Tensor max, Tensor max_values, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: 
max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor) + variants: function + +- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +- func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +- func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: mean(Tensor self, *, ScalarType dtype) -> Tensor + +- func: mean(Tensor self) -> Tensor + +- func: mean(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: mean(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: mean(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: mean_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: median(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: median_out(Tensor values, Tensor indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: min(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + +- func: min_out(Tensor min, Tensor min_indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: min_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? 
bias, IntList padding, IntList stride, IntList dilation, int64_t groups) -> Tensor + variants: function + +- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> Tensor + variants: function + +- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> (Tensor, Tensor) + variants: function + +- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) -> (Tensor, Tensor, Tensor) + variants: function + +- func: mm(Tensor self, Tensor mat2) -> Tensor + +- func: mm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + +- func: mode(Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + +- func: mode_out(Tensor values, Tensor indices, Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function + +- func: mv(Tensor self, Tensor vec) -> Tensor + +- func: mv_out(Tensor result, Tensor self, Tensor vec) -> Tensor + variants: function + +- func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + +- func: ones(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: ones_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: ones_like(Tensor self) -> Tensor + variants: function + +- func: ones_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: ones(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: pairwise_distance(Tensor x1, Tensor x2, double p=2, double eps=1e-6, bool keepdim=false) -> Tensor + variants: function + +- func: permute(Tensor self, IntList dims) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
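Like `expand` above, `permute` is exposed as a method only. The sketch below (name `permute_example` and shapes chosen purely for illustration) shows the call and the resulting size order.

```
// Usage sketch for the method-only permute declaration above.
#include <ATen/ATen.h>

void permute_example() {
  at::Tensor t = at::ones({2, 3, 4});
  // Move the last dimension to the front; p is a view with sizes {4, 2, 3}.
  at::Tensor p = t.permute({2, 0, 1});
}
```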
+ +- func: pin_memory(Tensor self) -> Tensor + +- func: pinverse(Tensor self, double rcond=1e-15) -> Tensor + +- func: rand(IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: rand(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: rand_out(Tensor result, IntList size, *) -> Tensor + variants: function + +- func: rand_out(Tensor result, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: rand_like(Tensor self) -> Tensor + variants: function + +- func: rand_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: rand(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint(int64_t high, IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t low, int64_t high, IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(int64_t low, int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randint(Type dtype, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint(Type dtype, int64_t low, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randint_out(Tensor result, int64_t high, IntList size, *) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t high, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *) -> Tensor + variants: function + +- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t high) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t low, int64_t high) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t high, *, TensorOptions options) -> Tensor + variants: function + +- func: randint_like(Tensor self, int64_t low, int64_t high, *, TensorOptions options) -> Tensor + variants: function + +- func: randn(IntList size, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randn(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randn_out(Tensor result, IntList size, *) -> Tensor + variants: function + +- func: randn_out(Tensor result, IntList size, *, Generator* generator) -> Tensor + variants: function + +- func: randn_like(Tensor self) -> Tensor + variants: function + +- func: randn_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: randn(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: randperm(int64_t n, *, TensorOptions options={}) -> Tensor + variants: function + +- func: randperm(int64_t n, *, Generator* generator, TensorOptions options={}) -> Tensor + variants: function + +- func: randperm_out(Tensor result, int64_t n, *) -> Tensor + variants: function + +- func: randperm_out(Tensor result, int64_t n, *, Generator* generator) -> Tensor + variants: function + dispatch: + CPU: 
randperm_out_cpu + CUDA: randperm_out_cuda + +- func: randperm(Type dtype, int64_t n, *, Generator* generator=nullptr) -> Tensor + variants: function + deprecated: true + +- func: range(Scalar start, Scalar end, TensorOptions options={}) -> Tensor + variants: function + +- func: range(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor + variants: function + +- func: range_out(Tensor result, Scalar start, Scalar end) -> Tensor + variants: function + +- func: range_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor + variants: function + +- func: range(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor + variants: function + deprecated: true + +- func: repeat(Tensor self, IntList repeats) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + +- func: reshape(Tensor self, IntList shape) -> Tensor + +- func: reshape_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: RoiPooling2d_forward_cpu + CUDA: RoiPooling2d_forward_cuda + +- func: RoiPooling2d_backward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, Tensor gradOutput, Tensor argmaxes) -> Tensor + variants: function + dispatch: + CPU: RoiPooling2d_backward_cpu + CUDA: RoiPooling2d_backward_cuda + +- func: round(Tensor self) -> Tensor + +- func: round_(Tensor self) -> Tensor + dispatch: + CPU: _round__cpu + CUDA: _round__cuda + +- func: round_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _round_out_cpu + CUDA: _round_out_cuda + +- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor + variants: function + +- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor + variants: function + +- func: relu(Tensor self) -> Tensor + +- func: relu_(Tensor self) -> Tensor + +- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + dispatch: + CPU: hardshrink_cpu + CUDA: hardshrink_cuda + +- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + dispatch: + CPU: hardshrink_backward_cpu + CUDA: hardshrink_backward_cuda + +- func: rsqrt(Tensor self) -> Tensor + +- func: rsqrt_(Tensor self) -> Tensor + dispatch: + CPU: _rsqrt__cpu + CUDA: _rsqrt__cuda + +- func: rsqrt_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _rsqrt_out_cpu + CUDA: _rsqrt_out_cuda + +- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor + +- func: selu(Tensor self) -> Tensor + variants: function + +- func: selu_(Tensor self) -> Tensor + variants: function + +- func: sigmoid(Tensor self) -> Tensor + +- func: sigmoid_(Tensor self) -> Tensor + dispatch: + CPU: _sigmoid__cpu + CUDA: _sigmoid__cuda + +- func: sigmoid_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sigmoid_out_cpu + CUDA: _sigmoid_out_cuda + +- func: sin(Tensor self) -> Tensor + +- func: sin_(Tensor self) -> Tensor + dispatch: + CPU: _sin__cpu + CUDA: _sin__cuda + +- func: sin_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sin_out_cpu + CUDA: _sin_out_cuda + +- func: sinh(Tensor self) -> Tensor + +- func: 
sinh_(Tensor self) -> Tensor + dispatch: + CPU: _sinh__cpu + CUDA: _sinh__cuda + +- func: sinh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sinh_out_cpu + CUDA: _sinh_out_cuda + +- func: size(Tensor self, int64_t dim) -> int64_t + device_guard: false + +- func: slice(Tensor self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) -> Tensor + +- func: slogdet(Tensor self) -> (Tensor, Tensor) + +- func: smm(Tensor self, Tensor mat2) -> Tensor + +- func: softmax(Tensor self, int64_t dim) -> Tensor + dispatch: + CPU: softmax_cpu + CUDA: softmax_cuda + +- func: softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor + dispatch: + CPU: softmax_backward_cpu + CUDA: softmax_backward_cuda + +- func: split(Tensor self, int64_t split_size, int64_t dim=0) -> TensorList + +- func: split_with_sizes(Tensor self, IntList split_sizes, int64_t dim=0) -> TensorList + +- func: squeeze(Tensor self) -> Tensor + +- func: squeeze(Tensor self, int64_t dim) -> Tensor + +- func: squeeze_(Tensor self) -> Tensor + variants: method + +- func: squeeze_(Tensor self, int64_t dim) -> Tensor + variants: method + +- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + +- func: sspaddmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: _sspaddmm_out_only_sparse + CUDA: _sspaddmm_out_only_sparse_cuda + SparseCPU: _sspaddmm_out_cpu + SparseCUDA: _sspaddmm_out_cuda + +- func: stack(TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +- func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor + variants: function + +# The signature is designed to be consistent with librosa except that it is +# missing the `pad_mode` and `center` arguments, which are taken care of at +# `torch.functional.py`. They shall be moved here once we have mapping between +# Python strings and C++ Enum in codegen. +- func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor + python_default_init: + hop_length: n_fft >> 2 + win_length: n_fft + +- func: stride(Tensor self, int64_t dim) -> int64_t + device_guard: false + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
+- func: sum(Tensor self, *, ScalarType dtype) -> Tensor + +- func: sum(Tensor self) -> Tensor + +- func: _sum(Tensor self) -> Tensor + dispatch: + CPU: _sum_cpu + CUDA: _sum_cuda + +- func: sum(Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + +- func: sum(Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor + +- func: _sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function + +- func: sum_out(Tensor result, Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: _sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function + +- func: _sum_cuda_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + dispatch: + CUDA: _sum_out_cuda + +- func: sqrt(Tensor self) -> Tensor + +- func: sqrt_(Tensor self) -> Tensor + dispatch: + CPU: _sqrt__cpu + CUDA: _sqrt__cuda + +- func: sqrt_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _sqrt_out_cpu + CUDA: _sqrt_out_cuda + +- func: std(Tensor self, bool unbiased=true) -> Tensor + +- func: std(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + +- func: std_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function + +# FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. +- func: prod(Tensor self, *, ScalarType dtype) -> Tensor + +- func: prod(Tensor self) -> Tensor + +- func: _prod(Tensor self) -> Tensor + dispatch: + CPU: _prod_cpu + CUDA: _prod_cuda + +- func: prod(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + +- func: prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: prod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + +- func: _prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + +- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function + +- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + +- func: prod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function + +- func: _prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function + dispatch: + CPU: _prod_out_cpu + CUDA: _prod_out_cuda + +- func: t(Tensor self) -> Tensor + +- func: t_(Tensor self) -> Tensor + variants: method + +- func: tan(Tensor self) -> Tensor + +- func: tan_(Tensor self) -> Tensor + dispatch: + CPU: _tan__cpu + CUDA: _tan__cuda + +- func: tan_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _tan_out_cpu + CUDA: _tan_out_cuda + +- func: tanh(Tensor self) -> Tensor + +- func: tanh_(Tensor self) -> Tensor + dispatch: + CPU: _tanh__cpu + CUDA: _tanh__cuda + +- func: tanh_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _tanh_out_cpu + CUDA: _tanh_out_cuda + +- func: transpose(Tensor self, int64_t dim0, int64_t dim1) -> Tensor + +- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor + variants: method + 
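The FIXME earlier in this hunk explains why `sum` (and `prod`) carry separate dtype overloads instead of a single optional `dtype`. The sketch below just shows how the three public `sum` signatures declared above are selected at the call site; the function name `sum_example`, the shape, and the choice of `at::kDouble` are illustrative assumptions.

```
// Call-site sketch for the sum overloads declared earlier in this hunk.
#include <ATen/ATen.h>

void sum_example() {
  at::Tensor t = at::ones({2, 3});
  at::Tensor total     = at::sum(t);                          // sum(Tensor self)
  at::Tensor as_double = at::sum(t, at::kDouble);             // sum(Tensor self, *, ScalarType dtype)
  at::Tensor per_col   = at::sum(t, {0}, /*keepdim=*/false);  // sum(Tensor self, IntList[1] dim, bool keepdim)
}
```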
+- func: flip(Tensor self, IntList dims) -> Tensor + dispatch: + CPU: flip_cpu + CUDA: flip_cuda + +- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor + variants: function + +- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: trunc(Tensor self) -> Tensor + +- func: trunc_(Tensor self) -> Tensor + dispatch: + CPU: _trunc__cpu + CUDA: _trunc__cuda + +- func: trunc_out(Tensor result, Tensor self) -> Tensor + variants: function + dispatch: + CPU: _trunc_out_cpu + CUDA: _trunc_out_cuda + +- func: type_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: _unique(Tensor self, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_cpu + CUDA: _unique_cuda + +- func: _unsafe_view(Tensor self, IntList size) -> Tensor + variants: function + +- func: unsqueeze(Tensor self, int64_t dim) -> Tensor + +- func: unsqueeze_(Tensor self, int64_t dim) -> Tensor + variants: method + +- func: var(Tensor self, bool unbiased=true) -> Tensor + +- func: var(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + +- func: var_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function + +- func: view_as(Tensor self, Tensor other) -> Tensor + variants: method + +# we define both of these because 'where' does the broadcast and '_s_where' doesn't; +# this allows us to implicitly calculate the broadcast derivative, while only dealing with the +# _s_where derivative. +- func: where(BoolTensor condition, Tensor self, Tensor other) -> Tensor +- func: _s_where(BoolTensor condition, Tensor self, Tensor other) -> Tensor + dispatch: + CPU: _s_where_cpu + CUDA: _s_where_cuda + +- func: zeros(IntList size, TensorOptions options={}) -> Tensor + variants: function + +- func: zeros_out(Tensor result, IntList size) -> Tensor + variants: function + +- func: zeros_like(Tensor self) -> Tensor + variants: function + +- func: zeros_like(Tensor self, *, TensorOptions options) -> Tensor + variants: function + +- func: zeros(Type dtype, IntList size) -> Tensor + variants: function + deprecated: true + +- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor + dispatch: + CPU: _standard_gamma_grad_cpu + CUDA: _standard_gamma_grad_cuda + +- func: _standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor + dispatch: + CPU: _s_gamma_cpu + CUDA: _s_gamma_cuda + +- func: poisson(Tensor self, Generator* generator=nullptr) -> Tensor + variants: function + dispatch: + CPU: _s_poisson_cpu + CUDA: _s_poisson_cuda + +# When more variants get ported to native, this dispatch will get more +# complicated + +- func: native_norm(Tensor self, Scalar p=2) -> Tensor + variants: function + dispatch: + SparseCPU: norm_sparse + SparseCUDA: norm_sparse + +- func: norm(Tensor self, Scalar p=2) -> Tensor + variants: method, function + +- func: norm(Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor + python_default_init: + p: 2 + +- func: norm_out(Tensor result, Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor + variants: function + python_default_init: + p: 2 + +- func: native_clone(Tensor self) -> Tensor + variants: function + dispatch: + SparseCPU: clone_sparse + SparseCUDA: clone_sparse + +- func: 
clone(Tensor self) -> Tensor + +- func: native_resize_as_(Tensor self, Tensor the_template) -> Tensor + variants: function + dispatch: + SparseCPU: resize_as_sparse_ + SparseCUDA: resize_as_sparse_ + +- func: resize_as_(Tensor self, Tensor the_template) -> Tensor + +- func: native_pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor + variants: function + dispatch: + SparseCPU: pow_out_sparse_scalar + SparseCUDA: pow_out_sparse_scalar + +- func: native_pow(Tensor self, Scalar exponent) -> Tensor + variants: function + dispatch: + SparseCPU: pow_sparse_scalar + SparseCUDA: pow_sparse_scalar + +- func: pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor + variants: function + +- func: pow(Tensor self, Scalar exponent) -> Tensor + variants: method, function + +- func: native_zero_(Tensor self) -> Tensor + variants: function + dispatch: + SparseCPU: zero_sparse_ + SparseCUDA: zero_sparse_ + +- func: zero_(Tensor self) -> Tensor + +- func: s_native_add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_out_sparse_cpu + SparseCUDA: s_add_out_sparse_cuda + +- func: native_add_out(Tensor result, Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_out_dense_sparse_cpu + CUDA: add_out_dense_sparse_cuda + +- func: s_native_add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_sparse_cpu + SparseCUDA: s_add_sparse_cuda + +- func: native_add(Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_dense_sparse_cpu + CUDA: add_dense_sparse_cuda + +- func: s_native_add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_add_sparse_cpu_ + SparseCUDA: s_add_sparse_cuda_ + +- func: native_add_(Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_dense_sparse_cpu_ + CUDA: add_dense_sparse_cuda_ + +- func: add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + +- func: add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method, function + +- func: add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method + + + +- func: s_native_sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_out_sparse_cpu + SparseCUDA: s_sub_out_sparse_cuda + +- func: s_native_sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_sparse_cpu + SparseCUDA: s_sub_sparse_cuda + +- func: s_native_sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + SparseCPU: s_sub_sparse_cpu_ + SparseCUDA: s_sub_sparse_cuda_ + +- func: sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + +- func: sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method, function + +- func: sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: method + + + +- func: s_native_mul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_out_sparse_cpu + SparseCUDA: s_mul_out_sparse_cuda + +- func: s_native_mul(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_sparse_cpu + 
SparseCUDA: s_mul_sparse_cuda + +- func: s_native_mul_(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + SparseCPU: s_mul_sparse_cpu_ + SparseCUDA: s_mul_sparse_cuda_ + +- func: native_mul_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_out_sparse_scalar + SparseCUDA: mul_out_sparse_scalar + +- func: native_mul(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_sparse_scalar + SparseCUDA: mul_sparse_scalar + +- func: native_mul_(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: mul_sparse_scalar_ + SparseCUDA: mul_sparse_scalar_ + +- func: mul_out(Tensor result, Tensor self, Tensor other) -> Tensor + variants: function + +- func: mul_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + +- func: mul(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: mul(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: mul_(Tensor self, Tensor other) -> Tensor + variants: method + +- func: mul_(Tensor self, Scalar other) -> Tensor + variants: method + + + +- func: native_div_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_out_sparse_scalar + SparseCUDA: div_out_sparse_scalar + +- func: native_div(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_sparse_scalar + SparseCUDA: div_sparse_scalar + +- func: native_div_(Tensor self, Scalar other) -> Tensor + variants: function + dispatch: + SparseCPU: div_sparse_scalar_ + SparseCUDA: div_sparse_scalar_ + +- func: div_out(Tensor result, Tensor self, Scalar other) -> Tensor + variants: function + +- func: div(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: div_(Tensor self, Scalar other) -> Tensor + variants: method + + +- func: s_native_addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_out_sparse_dense_cpu + CUDA: s_addmm_out_sparse_dense_cuda + +- func: s_native_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_sparse_dense_cpu + CUDA: s_addmm_sparse_dense_cuda + +- func: s_native_addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: s_addmm_sparse_dense_cpu_ + CUDA: s_addmm_sparse_dense_cuda_ + +- func: addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method, function + +- func: addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + + +- func: native_tensor(Type self_ty) -> Tensor + variants: function + dispatch: + SparseCPU: new_sparse + SparseCUDA: new_sparse + +- func: native_tensor(Type self_ty, IntList size) -> Tensor + variants: function + dispatch: + SparseCPU: new_with_size_sparse + SparseCUDA: new_with_size_sparse + +- func: tensor(Type dtype) -> Tensor + variants: [] + +- func: tensor(Type dtype, IntList size) -> Tensor + variants: [] + + +# NB: I have to decompose sparse_coo_tensor into two functions, because +# it has custom dispatch logic for which Type to dispatch on (we must +# use the sparse 
equivalent of the type of the SECOND argument). +# +# The actual dispatcher, native_sparse_coo_tensor, has all of its overloads +# removed so you don't accidentally trigger the default behavior, which +# is to infer Type based on the first argument (indices), which is ~never +# what you want. (I guess hypothetically it would work; you'd +# just only ever dispatch to CPULongTensor or CUDALongTensor, but that +# seems a bit too finely balanced.) + +- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_sparse + SparseCUDA: new_with_tensor_sparse + +- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_and_size_sparse + SparseCUDA: new_with_tensor_and_size_sparse + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor + variants: [] + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + + +- func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: [] + dispatch: + SparseCPU: new_with_tensor_and_size_unsafe_sparse + SparseCUDA: new_with_tensor_and_size_unsafe_sparse + +- func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor + variants: function + + +- func: sparse_raw_resize_(Tensor self, IntList size, int64_t sparseDims, int64_t denseDims) -> Tensor + variants: method + dispatch: + SparseCPU: raw_resize_sparse_ + SparseCUDA: raw_resize_sparse_ + + +- func: _sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor + variants: method + dispatch: + CPU: sparse_mask_cpu + CUDA: sparse_mask_cuda + + +- func: to_dense(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: sparse_to_dense + SparseCUDA: sparse_to_dense + + +- func: _sparseDims(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _sparseDims_sparse + SparseCUDA: _sparseDims_sparse + device_guard: False + +# legacy method +- func: _dimI(Tensor self) -> int64_t + variants: method + dispatch: _sparseDims_sparse + device_guard: False + + +- func: _denseDims(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _denseDims_sparse + SparseCUDA: _denseDims_sparse + device_guard: False + +# legacy method +- func: _dimV(Tensor self) -> int64_t + variants: method + dispatch: _denseDims_sparse + device_guard: False + + +- func: _nnz(Tensor self) -> int64_t + variants: method + dispatch: + SparseCPU: _nnz_sparse + SparseCUDA: _nnz_sparse + device_guard: False + + +- func: coalesce(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: coalesce_sparse_cpu + SparseCUDA: coalesce_sparse_cuda + + +- func: is_coalesced(Tensor self) -> bool + variants: method + dispatch: + SparseCPU: is_coalesced_sparse + SparseCUDA: is_coalesced_sparse + device_guard: False + + +- func: _indices(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: _indices_sparse + SparseCUDA: _indices_sparse + device_guard: False + + +- func: _values(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: _values_sparse + SparseCUDA: _values_sparse + device_guard: False + + +- func: hspmm_out(Tensor result, Tensor mat1, Tensor mat2) -> Tensor + variants: function + dispatch: + SparseCPU: hspmm_out_sparse_cpu + SparseCUDA: hspmm_out_sparse_cuda + +- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor + variants: function + dispatch: + SparseCPU: hspmm_sparse_cpu + SparseCUDA: 
hspmm_sparse_cuda + +# This "raw copy" doesn't handle conversions NOR does it handle non-blocking. +- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor + variants: function + dispatch: + SparseCPU: copy_sparse_ + SparseCUDA: copy_sparse_ + +- func: numel(Tensor self) -> int64_t + variants: + - method + - function + device_guard: False + +- func: unbind(Tensor self, int64_t dim=0) -> TensorList + variants: + - method + - function + +- func: native_get_device(Tensor self) -> int64_t + variants: function + dispatch: + SparseCUDA: get_device_sparse_cuda + device_guard: False + +- func: get_device(Tensor self) -> int64_t + device_guard: False + +- func: meshgrid(TensorList tensors) -> TensorList + variants: function diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp new file mode 100644 index 0000000..0cac9bc --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -0,0 +1,390 @@ +// Basic functions on sparse tensors + +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +/****************************************************************************** + * access methods + ******************************************************************************/ + +int64_t _sparseDims_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->sparseDims(); +} + +int64_t _denseDims_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->denseDims(); +} + +bool is_coalesced_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->coalesced(); +} + +int64_t _nnz_sparse(const SparseTensor& self) { + return _get_sparse_impl(self)->nnz(); +} + +// TODO: This is wrong: if nnz == 0 but indices/values is not +// empty then we'll return all the values, even the ones that +// are "masked out" by nnz + +Tensor _indices_sparse(const SparseTensor& self) { + auto nnz = self._nnz(); + if (nnz == 0) { + // Narrows don't work on 0-length tensors + // TODO: When we handle zero-size dims correctly, this will work and + // we can remove the special case. + return _get_sparse_impl(self)->indices(); + } + return _get_sparse_impl(self)->indices().narrow(1, 0, nnz); +} + +Tensor _values_sparse(const SparseTensor& self) { + // See indices for some relevant notes + auto nnz = self._nnz(); + if (nnz == 0) { + return _get_sparse_impl(self)->values(); + } + return _get_sparse_impl(self)->values().narrow(0, 0, nnz); +} + +/****************************************************************************** + * creation methods + ******************************************************************************/ + +/* Empty init */ +SparseTensor new_sparse(const SparseType& dtype) { + AT_ASSERT(!dtype.is_undefined()); + AT_ASSERT(!dtype.is_variable()); + AT_ASSERT(dtype.is_sparse()); + // TODO: Hmm... this const_cast business seems a bit dodgy + return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); +} + +/*** Helper methods ***/ + +/* Pointer-copy init */ +SparseTensor new_with_tensor_sparse(const LongTensor& indices, const Tensor& values_) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + // TODO: This is a temporary test until we support zero-size dims. 
+ // I'm NOT adding the "obvious" bypass code, because it wasn't supported + // previously + AT_CHECK(indices.numel() != 0, "cannot construct sparse tensor with empty indices; use the nullary constructor instead"); + + const SparseType& dtype = values.type().toSparse(); + + // If sizes are not given, it is inferred as max index of each dim. + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + + std::vector computed_sizes(sparseDims + denseDims); + // NB: It used to keepdim. I think that was wrong. + LongTensor computed_indices_sizes = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); + computed_indices_sizes.add_(1); // len = max_index + 1 + LongTensor cpu_computed_indices_sizes; + if (computed_indices_sizes.is_cuda()) { + cpu_computed_indices_sizes = at::CPU(kLong).tensor(computed_indices_sizes.sizes()); + cpu_computed_indices_sizes.copy_(computed_indices_sizes); + } else { + cpu_computed_indices_sizes = computed_indices_sizes; + } + auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); + for (int64_t d = 0; d < sparseDims; d++) { + computed_sizes[static_cast(d)] = cpu_computed_indices_sizes_accessor[d]; + } + for (int64_t d = 0; d < denseDims; d++) { + computed_sizes[static_cast(sparseDims + d)] = values.size(d+1); + } + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, computed_sizes, indices, values); +} + +SparseTensor new_with_size_sparse(const SparseType& dtype, ArrayRef size) { + SparseTensor self = new_sparse(dtype); + _raw_resize_sparse(self, size.size(), 0, size); + return self; +} + +// NB: Got rid of the sizes == NULL case +SparseTensor new_with_tensor_and_size_unsafe_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + const SparseType& dtype = values.type().toSparse(); + // NB: used to be a dim() == 0 test, but that's legacy TH semantics + if (indices.numel() == 0 && values.numel() == 0) { + return new_with_size_sparse(dtype, sizes); + } + + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); +} + +// NB: Got rid of the sizes == NULL case +SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { + Tensor values; + if (values_.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + values = values_.expand({1}); + } else { + values = values_; + } + + const SparseType& dtype = values.type().toSparse(); + // NB: This used to be dims, but mumble TH handling zero-sized tensors + // incorrectly + if (indices.numel() == 0 && values.numel() == 0) { + return new_with_size_sparse(dtype, sizes); + } + + int64_t sparseDims = indices.size(0); + int64_t denseDims = values.dim() - 1; + AT_CHECK(sizes.size() == sparseDims + denseDims, "number of dimensions must be sparseDims (", sparseDims, ") + denseDims (", denseDims, "), but got ", sizes); + + LongTensor max_indices = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); + LongTensor cpu_max_indices; + if (max_indices.is_cuda()) { + cpu_max_indices = at::CPU(kLong).copy(max_indices); + } else { + cpu_max_indices = max_indices; + } + auto cpu_max_indices_accessor = cpu_max_indices.accessor(); + for (int64_t d = 0; d < sparseDims; d++) { + // NB: This used to sync 
ndim times to access each entry; now we copy + // everything to CPU first and then access it. + int64_t max_index_in_dim = cpu_max_indices_accessor[d]; + int64_t dim_size = sizes[static_cast(d)]; + AT_CHECK(max_index_in_dim < dim_size, + "sizes is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); + } + for (int64_t d = 0; d < denseDims; d++) { + int64_t values_size = values.size(d+1); + int64_t specified_size = sizes[static_cast(sparseDims + d)]; + AT_CHECK(values_size <= specified_size, + "values and sizes are inconsistent: sizes[", d + sparseDims, "] is ", specified_size, + " but values.size(", d + 1, ") is ", values_size); + } + return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); +} + +// NB: Deleted newWithSizeNd variants + +SparseTensor clone_sparse(const SparseTensor& self) { + SparseTensor other = new_sparse(self.type()); + _raw_resize_sparse(other, self._sparseDims(), self._denseDims(), self.sizes()); + // NB: This seems to preserve the size of the UN-narrowed indices and + // values. Veeery interesting. + _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values()); + _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); + _get_sparse_impl(other)->set_nnz(self._nnz()); + return other; +} + +/****************************************************************************** + * reshaping methods + ******************************************************************************/ + +/* +// We should implement a utility function which: (1) sets nnz and (2) resizes +// indices/values to hold enough space to fit nnz, if nnz is larger than +// the previous amount. This ensures that we maintain the nnz invariant. +void _resize_nnz_(const SparseTensor& self, int64_t nnz) { +} +*/ + +void resize_sparse(const SparseTensor& self, ArrayRef size) { + _raw_resize_sparse(self, size.size(), 0, size); +} + +SparseTensor& raw_resize_sparse_(SparseTensor& self, ArrayRef size, int64_t sparseDims, int64_t denseDims) { + if (sparseDims == -1) { + sparseDims = self._indices().size(0); + } + if (denseDims == -1) { + denseDims = self._values().dim() - 1; + } + _raw_resize_sparse(self, sparseDims, denseDims, size); + return self; +} + +namespace { + bool _is_same_size_as_sparse(const SparseTensor& self, const SparseTensor& src) { + return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims() && self.sizes().equals(src.sizes()); + } +} + +SparseTensor& resize_as_sparse_(SparseTensor& self, const SparseTensor& src) { + if (!_is_same_size_as_sparse(self, src)) { + _raw_resize_sparse(self, src._sparseDims(), src._denseDims(), src.sizes()); + } + return self; +} + +// NB: Dropped the resizeNd variants + +Tensor sparse_to_dense(const SparseTensor& self) { + Tensor dst = at::zeros(self.sizes(), self.type().toDense()); + return dst.add_(self); +} + +SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) { + if (isSameTensor(self, src)) return self; + _raw_resize_sparse(self, src._sparseDims(), src._denseDims(), src.sizes()); + // NB: This seems to copy the underlying full indices/values buffer + _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values()); + _get_sparse_impl(self)->set_coalesced(src.is_coalesced()); + _get_sparse_impl(self)->set_nnz(src._nnz()); + return self; +} + +SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { + AT_ASSERT(self.defined()); + AT_ASSERT(!self.is_variable()); + 
AT_ASSERT(self.is_sparse()); + + if (self._nnz() < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } + if (self.is_coalesced()) { + return self; + } + + LongTensor indices = self._indices(); + Tensor values = self._values().contiguous(); + int64_t sparseDims = self._sparseDims(); + int64_t denseDims = self._denseDims(); + int64_t nnz = self._nnz(); + + LongTensor indices_scalar = at::zeros({nnz}, kLong); + + int64_t factor = 1; + for (int64_t d = sparseDims - 1; d >= 0; d--) { + LongTensor indices_slice = indices.select(0, d); + indices_scalar.add_(indices_slice, factor); // cadd is swapped args + factor *= self.size(d); + } + + SparseTensor dst = new_sparse(self.type()); + _raw_resize_sparse(dst, sparseDims, denseDims, self.sizes()); + // TODO: is there a more idiomatic way to do this? + LongTensor newIndices = indices.type().tensor(indices.sizes()); + Tensor newValues = values.type().tensor(values.sizes()); + _alias_into_sparse(dst, newIndices, newValues); + + LongTensor indicesBuffer; + LongTensor indicesPermutation; + std::tie(indicesBuffer, indicesPermutation) = indices_scalar.sort(0); + // NB: The accessor accesses here rely on self._nnz() > 0 (tested earlier in this function) + auto newIndicesAccessor = newIndices.accessor(); + auto indicesAccessor = indices.accessor(); + auto indicesPermutationAccessor = indicesPermutation.accessor(); + auto indicesBufferAccessor = indicesBuffer.accessor(); + + int64_t i = -1; + AT_DISPATCH_ALL_TYPES( + values.type(), "coalesce", [&] { + int64_t prev = -1; + int64_t blockSize = values.stride(0); + scalar_t* values_ptr = values.data(); + scalar_t* newValues_ptr = newValues.data(); + for (int64_t j = 0; j < nnz; j++) { + int64_t pos = indicesPermutationAccessor[j]; + int64_t curr = indicesBufferAccessor[j]; + if (curr == prev) { + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } else { + ++i; + for (int64_t d = 0; d < sparseDims; d++) { + newIndicesAccessor[d][i] = indicesAccessor[d][pos]; + } + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } + prev = curr; + } + }); + + _get_sparse_impl(dst)->set_coalesced(true); + _get_sparse_impl(dst)->set_nnz(i + 1); + + return dst; +} + +SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { + AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + t.sizes(), " but mask has size ", mask.sizes()); + AT_ASSERT(!t.is_cuda()); // we were supposed to have dispatched on this + AT_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); + AT_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); + resize_as_sparse_(r, mask); + if (mask._nnz() == 0) { + r.zero_(); + return r; + } + int64_t dim = t.dim(); + int64_t sparseDims = mask._sparseDims(); + LongTensor mask_indices = mask._indices(); + Tensor mask_values = mask._values(); + Tensor r_values = r._values().type().tensor(mask_values.sizes()); + _alias_into_sparse(r, mask_indices.clone(), r_values); + _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); + int64_t r_nnz = mask._nnz(); + _get_sparse_impl(r)->set_nnz(r_nnz); + // NB: Relies on mask._nnz() == 0 test above + auto mask_indices_accessor = mask_indices.accessor(); + + if (dim > sparseDims) { + // NB: This used to reuse buffers, but I deoptimized it + for (int64_t i = 0; i < r_nnz; i++) { + Tensor 
srcBuffer = t; + for (int64_t d = 0; d < sparseDims; d++) { + srcBuffer = srcBuffer.select(0, mask_indices_accessor[d][i]); + } + Tensor dstBuffer = r_values.select(0, i); + dstBuffer.copy_(srcBuffer); + } + } else { + AT_DISPATCH_ALL_TYPES( + r_values.type(), "sparse_mask", [&] { + auto r_values_accessor = r_values.accessor(); + // NB: The old code did this pointer access in a weird way (going straight + // to storage + storageOffset.) Was there perhaps a method to the + // madness? + scalar_t* t_ptr = t.data(); + for (int64_t i = 0; i < r_nnz; i++) { + int64_t idx = 0; + for (int64_t d = 0; d < sparseDims; d++) { + idx += mask_indices_accessor[d][i] * t.stride(d); + } + scalar_t val = t_ptr[idx]; + r_values_accessor[i] = val; + } + }); + } + return r; +} + +SparseTensor sparse_mask_cpu(const Tensor& t, SparseTensorRef mask) { + SparseTensor r = t.type().toSparse().tensor(); + sparse_mask_out_cpu(r, t, mask.tref); + return r; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp new file mode 100644 index 0000000..4a25665 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -0,0 +1,870 @@ +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +// -------------------------------------------------------------------- +// Utility functions +// -------------------------------------------------------------------- + +namespace { + LongTensor _to_csr(const int64_t* indices, int64_t dim, int64_t nnz) { + int64_t h, i, hp0, hp1; + LongTensor csr = native::zeros({dim + 1}, kLong); + + // TODO: eliminate this conditional when zero-size dims supported correctly + if (nnz > 0) { + auto csr_accessor = csr.accessor(); + // Convert the sparse matrix to CSR format +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; iindices() + // and not self._indices(), because the latter will possibly + // return a view (which means that the in-place operation will + // not work). + if (_get_sparse_impl(self)->indices().numel()) { + // TODO: To be fixed when we support zero-size dims + _get_sparse_impl(self)->indices().resize_({0}); + } + + if (_get_sparse_impl(self)->values().numel()) { + _get_sparse_impl(self)->values().resize_({0}); + } + _get_sparse_impl(self)->set_nnz(0); + _get_sparse_impl(self)->set_coalesced(true); // NB: This is new + return self; +} + +// NB: Don't need zeros, zeros_like, already implemented in TensorFactories + +// -------------------------------------------------------------------- +// mul(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + r._values().mul_(value); + } else { + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... 
needed because mul_out takes Tensor& + at::mul_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + } + return r; +} + +SparseTensor mul_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + mul_out_sparse_scalar(r, t, value); + return r; +} + +SparseTensor& mul_sparse_scalar_(SparseTensor& t, Scalar v) { + return mul_out_sparse_scalar(t, t, v); +} + +// -------------------------------------------------------------------- +// log1p(SparseTensor) +// -------------------------------------------------------------------- + +// TODO: add in-place variant + +SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + // don't have in-place log1p for uncoalesced input because coalesce() is not in-place + AT_CHECK( + r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + } + else { + r = raw_copy_sparse_(r, t.coalesce()); + } + r._values().log1p_(); + return r; +} + +SparseTensor& log1p_sparse_(SparseTensor& t) { + AT_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + return log1p_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// pow(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +// TODO: add in-place variant + +SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t_.is_sparse()); + AT_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); + + // This coalesce is why we can't easily provide an inplace variant + SparseTensor t = t_.coalesce(); + + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... needed because pow_out takes Tensor& + at::pow_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + + return r; +} + +SparseTensor pow_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + pow_out_sparse_scalar(r, t, value); + return r; +} + +// -------------------------------------------------------------------- +// div(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +SparseTensor& div_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + + if (isSameTensor(r, t)) { + r._values().div_(value); + } else { + r.resize_as_(t); + r._indices().resize_as_(t._indices()); + r._indices().copy_(t._indices()); + Tensor r_values = r._values(); // Sigh... 
needed because div_out takes Tensor& + at::div_out(r_values, t._values(), value); + _get_sparse_impl(r)->set_nnz(t._nnz()); + _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + } + return r; +} + +SparseTensor div_sparse_scalar(const SparseTensor& t, Scalar value) { + SparseTensor r = t.type().tensor(); + div_out_sparse_scalar(r, t, value); + return r; +} + +SparseTensor& div_sparse_scalar_(SparseTensor& t, Scalar value) { + return div_out_sparse_scalar(t, t, value); +} + +// -------------------------------------------------------------------- +// norm(SparseTensor, Scalar) +// -------------------------------------------------------------------- + +// Only supports floating point, FYI +Tensor norm_sparse(const SparseTensor& self, Scalar value) { + AT_ASSERT(self.is_sparse()); + + return self.coalesce()._values().norm(value); +} + +// -------------------------------------------------------------------- +// add(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(r.is_sparse()); + AT_ASSERT(t.is_sparse()); + AT_ASSERT(!t.is_cuda()); // the dispatch argument + AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); + + if (src._nnz() == 0) { + return raw_copy_sparse_(r, t); + } + if (t._nnz() == 0) { + return mul_out_sparse_scalar(r, src, value); + } + + AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(), max_nnz = t_nnz + s_nnz; + bool t_coalesced = t.is_coalesced(), s_coalesced = src.is_coalesced(); + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices = t._indices(); + Tensor t_values = t._values(); + LongTensor src_indices = src._indices(); + Tensor s_values = src._values(); + LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + Tensor r_values = _new_values_with_size_of(s_values, max_nnz).zero_(); + r.resize_as_(src); + _get_sparse_impl(r)->set_indices_and_values(r_indices, r_values); // TODO: sigh + + int64_t blockSize = r_values.stride(0); + int64_t cmp, d; + int64_t r_i = 0, t_i = 0, s_i = 0; + + // NB: relies on nnz tests above + auto t_indices_accessor = t_indices.accessor(); + auto r_indices_accessor = r_indices.accessor(); + auto src_indices_accessor = src_indices.accessor(); + + AT_DISPATCH_ALL_TYPES( + t_values.type(), "cadd_sparse", [&] { + scalar_t* t_values_ptr = t_values.data(); + scalar_t* s_values_ptr = s_values.data(); + scalar_t* r_values_ptr = r_values.data(); + scalar_t cast_value = value.to(); + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < sparseDims; d++) { + if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { + cmp = 1; + break; + } + if (t_indices_accessor[d][t_i] > src_indices_accessor[d][s_i]) { + cmp = -1; + break; + } + } + } + if (cmp >= 0) { + for (d = 0; d < sparseDims; 
d++) { + r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; + } + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + t_i++; + } + if (cmp <= 0) { + for (d = 0; d < sparseDims; d++) { + r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; + } + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + s_i++; + } + r_i++; + } + } + ); + + _get_sparse_impl(r)->set_nnz(r_i); + // TODO: I think it may be possible to track inside the loop and + // detect when we are uncoalesced (e.g., by observing that an + // index goes backwards) which may be more precise than using the + // coalesced flag here. But this is easy. + _get_sparse_impl(r)->set_coalesced(t_coalesced && s_coalesced); + + return r; +} + +SparseTensor s_add_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_add_out_sparse_cpu(r, t, src, alpha); + return r; +} + +SparseTensor& s_add_sparse_cpu_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_add_out_sparse_cpu(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// add(Tensor, SparseTensor, Scalar) +// formerly known as spcadd +// -------------------------------------------------------------------- + +template +void add_dense_sparse_worker_cpu(Tensor& r, Scalar value, const SparseTensor& sparse, const Tensor& indices, const Tensor& values) { + int64_t k; + + auto indices_accessor = indices.accessor(); + auto values_accessor = values.accessor(); + + scalar_t* r_ptr = r.data(); + scalar_t cast_value = value.to(); + + #pragma omp parallel for private(k) + for (k = 0; k < sparse._nnz(); k++) { + int64_t index = r.storage_offset(); + for (int64_t d = 0; d < sparse._sparseDims(); d++) { + index += r.stride(d) * indices_accessor[d][k]; + } + r_ptr[index] += cast_value * values_accessor[k]; + } +} + +Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef sparse__, Scalar value) { + const SparseTensor& sparse_ = sparse__.tref; + + AT_ASSERT(!r.is_sparse()); + AT_ASSERT(!dense.is_sparse()); + AT_ASSERT(sparse_.is_sparse()); + + AT_ASSERT(!dense.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse_.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); + + r.resize_as_(dense); + SparseTensor sparse = sparse_.coalesce(); + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + int64_t nDim = dense.dim(); + int64_t nDimI = sparse._sparseDims(); + + if (!isSameTensor(r, dense)) r.copy_(dense); + if (sparse._nnz() == 0) return r; + + // accessors rely on nnz test + if (nDim > nDimI) { + auto indices_accessor = indices.accessor(); + for (int64_t k = 0; k < sparse._nnz(); k++) { + Tensor dstBuffer = r; + for (int64_t d = 0; d < sparse._sparseDims(); d++) { + dstBuffer = dstBuffer.select(0, indices_accessor[d][k]); + } + Tensor srcBuffer = values.select(0, k); + dstBuffer.add_(srcBuffer, value); + } + } else { + AT_DISPATCH_ALL_TYPES( + values.type(), "add_dense_sparse", [&] { + add_dense_sparse_worker_cpu(r, value, sparse, indices, values); + 
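The worker above reduces dense-plus-sparse addition to a strided scatter-add: each non-zero's coordinates are folded into a flat offset using the dense tensor's strides, and `alpha * value` is accumulated at that slot. Below is a plain-C++ sketch of that arithmetic on `std::vector` buffers; the function name is made up and this is illustrative only, not the ATen code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// For each non-zero k, turn its sparse coordinates into a flat offset via the
// dense tensor's strides, then accumulate alpha * value into that slot.
void scatter_add_coo_into_dense(
    std::vector<double>& dense,                         // flattened dense buffer
    const std::vector<int64_t>& strides,                // one stride per sparse dim
    const std::vector<std::vector<int64_t>>& indices,   // [sparseDims][nnz]
    const std::vector<double>& values,                  // [nnz]
    double alpha) {
  const int64_t sparseDims = static_cast<int64_t>(indices.size());
  const int64_t nnz = static_cast<int64_t>(values.size());
  for (int64_t k = 0; k < nnz; k++) {
    int64_t offset = 0;
    for (int64_t d = 0; d < sparseDims; d++) {
      offset += strides[d] * indices[d][k];
    }
    dense[offset] += alpha * values[k];
  }
}

int main() {
  // 2x3 dense matrix, row-major strides {3, 1}; add non-zeros at (0,2) and (1,0).
  std::vector<double> dense(6, 0.0);
  scatter_add_coo_into_dense(dense, {3, 1}, {{0, 1}, {2, 0}}, {5.0, -1.0}, /*alpha=*/2.0);
  assert(dense[2] == 10.0);
  assert(dense[3] == -2.0);
  return 0;
}
```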
}); + } + return r; +} + +Tensor add_dense_sparse_cpu(const Tensor& t, SparseTensorRef src, Scalar alpha) { + Tensor r = t.type().tensor(); + add_out_dense_sparse_cpu(r, t, src, alpha); + return r; +} + +Tensor& add_dense_sparse_cpu_(Tensor& t, SparseTensorRef src, Scalar alpha) { + return add_out_dense_sparse_cpu(t, t, src, alpha); +} + + +// -------------------------------------------------------------------- +// sub(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_sub_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(!t.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "sub: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src.is_cuda(), "sub: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + // UGH... We're doing two dispatches on scalar type here for no good reason. + // NB: I tried adding an operator- to Scalar, but there isn't any good way + // to negate the tensor, because I have a TensorBase... + AT_DISPATCH_ALL_TYPES( + t.type(), "sub_sparse", [&] { + scalar_t cast_value = value.to(); + s_add_out_sparse_cpu(r, t, src, -cast_value); + } + ); + return r; +} + +SparseTensor s_sub_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_sub_out_sparse_cpu(r, t, src, alpha); + return r; +} + +SparseTensor& s_sub_sparse_cpu_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_sub_out_sparse_cpu(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// mul(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_mul_out_sparse_cpu(SparseTensor& r, const SparseTensor& t_, const SparseTensor& src_) { + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); + AT_ASSERT(!t_.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); + + if (src_._nnz() == 0 || t_._nnz() == 0) { + return r.zero_(); + } + + SparseTensor t = t_.coalesce(); + SparseTensor src = src_.coalesce(); + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); + int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices = t._indices(); + Tensor t_values = t._values(); + LongTensor src_indices = src._indices(); + Tensor s_values = src._values(); + LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + Tensor r_values = _new_values_with_size_of(t_values, max_nnz).zero_(); + r.resize_as_(src); + _get_sparse_impl(r)->set_indices_and_values(r_indices, r_values); // TODO: sigh + + int64_t match, d; + int64_t r_i = 0, t_i = 0, s_i = 0; + + // NB: relies on nnz test above + auto t_indices_accessor = t_indices.accessor(); + auto r_indices_accessor = r_indices.accessor(); + auto src_indices_accessor = src_indices.accessor(); + + // Check if we can find matching indices, and if so, write an + // entry to the result indices vector. 
Returns true if matching + // indices were found. + auto index_preamble = [&]() { + match = 1; + for (d = 0; d < sparseDims; d++) { + if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices_accessor[d][t_i] > src_indices_accessor[d][s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) return false; + for (d = 0; d < sparseDims; d++) { + r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; + } + return true; + }; + + if (t_values.dim() > 1) { + while (t_i < t_nnz && s_i < s_nnz) { + if (!index_preamble()) continue; + r_values.select(0, r_i).addcmul_(t_values.select(0, t_i), s_values.select(0, s_i)); + r_i++; + t_i++; + s_i++; + } + } else { + AT_DISPATCH_ALL_TYPES( + r_values.type(), "mul_out_sparse", [&] { + auto r_accessor = r_values.accessor(); + auto t_accessor = t_values.accessor(); + auto s_accessor = s_values.accessor(); + + while (t_i < t_nnz && s_i < s_nnz) { + if (!index_preamble()) continue; + r_accessor[r_i] = t_accessor[t_i] * s_accessor[s_i]; + r_i++; + t_i++; + s_i++; + } + } + ); + } + + _get_sparse_impl(r)->set_nnz(r_i); + _get_sparse_impl(r)->set_coalesced(true); + + return r; +} + +SparseTensor s_mul_sparse_cpu(const SparseTensor& t, const SparseTensor& src) { + SparseTensor r = t.type().tensor(); + s_mul_out_sparse_cpu(r, t, src); + return r; +} + +SparseTensor& s_mul_sparse_cpu_(SparseTensor& t, const SparseTensor& src) { + return s_mul_out_sparse_cpu(t, t, src); +} + +// -------------------------------------------------------------------- +// addmm(Tensor, SparseTensorRef, Tensor, Scalar, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +// NB: OMP pragmas have to get their own functions; can't put them in lambdas +template +void s_addmm_out_sparse_dense_worker(int64_t nnz, int64_t dim_i, int64_t dim_j, int64_t dim_k, Tensor& r, Scalar beta, const Tensor& t, Scalar alpha, const Tensor& csr, const Tensor& indices, const Tensor& values, const Tensor& dense) { + int64_t h, i; + + // r_ = alpha * sparse * dense + scalar_t cast_alpha = alpha.to(); + scalar_t cast_beta = beta.to(); + if (cast_beta == 0) { + r.zero_(); + } else if (cast_beta == 1) { + if (!isSameTensor(r, t)) { + r.copy_(t); + } + } else { + at::mul_out(r, t, beta); + } + + auto csr_accessor = csr.accessor(); + auto indices_accessor = indices.accessor(); + + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data(); + scalar_t* r_ptr = r.data(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t r_stride0 = r.stride(0); + int64_t r_stride1 = r.stride(1); +#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) + for (h = 0; h < dim_i; h++) { + int64_t i_start = csr_accessor[h]; + int64_t i_end = csr_accessor[h+1]; + for (i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + int64_t col = indices_accessor[1][i]; + if (col >= 0 && col < dim_j) { + THBlas_axpy(dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, dense_stride1, + r_ptr + h * r_stride0, r_stride1); + } else { + AT_ERROR("addmm: index out of bound: ", col, " not between 1 and ", dim_j); + } + } + } +}; + +Tensor& s_addmm_out_sparse_dense_cpu( + Tensor& r, + const Tensor& t, + const SparseTensor& sparse_, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + // TODO: This error message seems awfully opaque + AT_ASSERT(!t.is_cuda()); + AT_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA 
tensor"); + AT_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); + AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + + SparseTensor sparse = sparse_.coalesce(); + + // ixj * jxk = ixk + int64_t dim_i = sparse.size(0); + int64_t dim_j = sparse.size(1); + int64_t dim_k = dense.size(1); + + AT_CHECK(dense.size(0) == dim_j, + "addmm: Argument #3 (dense): Expected dim 0 size ", dim_j, ", got ", dense.size(0)); + AT_CHECK(t.size(0) == dim_i, + "addmm: Argument #1 (t): Expected dim 0 size ", dim_i, ", got ", t.size(0)); + AT_CHECK(t.size(1) == dim_k, + "addmm: Argument #1 (t): Expected dim 1 size ", dim_k, ", got ", t.size(1)); + + r.resize_({dim_i, dim_k}); + + int64_t nnz = sparse._nnz(); + + if (nnz == 0) { + at::mul_out(r, t, beta); + return r; + } + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + LongTensor csr = _to_csr(indices.data(), dim_i, nnz); + + AT_DISPATCH_ALL_TYPES( + values.type(), "addmm_sparse_dense", [&] { + s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, csr, indices, values, dense); + } + ); + + return r; + +} + +Tensor s_addmm_sparse_dense_cpu( + const Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + Tensor r = t.type().tensor(); + s_addmm_out_sparse_dense_cpu(r, t, sparse, dense, beta, alpha); + return r; +} + +Tensor& s_addmm_sparse_dense_cpu_( + Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + return s_addmm_out_sparse_dense_cpu(t, t, sparse, dense, beta, alpha); +} + + +// -------------------------------------------------------------------- +// hspmm(SparseTensor mat1, Tensor mat2) +// -------------------------------------------------------------------- + +SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, const Tensor& dense) { + // TODO: Make this a real argument + Scalar alpha = 1; + + AT_ASSERT(!sparse_.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, + "hspmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); + + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(dense.size(0) == k, + "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); + + _get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + + if (nnz == 0) { + r.zero_(); + return r; + } + + LongTensor indices = at::CPU(kLong).tensor({1, nnz}); + + // Initialize the sparse matrix that will be used with spaddmm to send rows + // from the dense 
matrix to rows of the output's value tensor + SparseTensor newSparse = sparse.clone(); + LongTensor spIndices = newSparse._indices(); + LongTensor valueIndices = spIndices.select(0, 0); + + // Compute output indices + auto valueIndices_accessor = valueIndices.accessor(); + auto indices_accessor = indices.accessor(); + + int64_t i = -1, prevIdx = -1; + for (int64_t j = 0; j < nnz; j++) { + int64_t currIdx = valueIndices_accessor[j]; + if (currIdx != prevIdx) { + indices_accessor[0][++i] = currIdx; + prevIdx = currIdx; + } + valueIndices_accessor[j] = i; + } + int64_t outNnz = i + 1; + indices.resize_({1, outNnz}); + Tensor values = dense.type().tensor({outNnz, n}); + _get_sparse_impl(newSparse)->_sizes_mut()[0] = outNnz; // TODO: use something safer + + // Compute output values tensor with sparse * dense multiplication + s_addmm_out_sparse_dense_cpu(values, values, newSparse, dense, 0, alpha); + _get_sparse_impl(r)->set_indices_and_values(indices, values); // TODO: sigh + + return r; +} + +SparseTensor hspmm_sparse_cpu(const SparseTensor& sparse, const Tensor& dense) { + SparseTensor r = sparse.type().tensor(); + hspmm_out_sparse_cpu(r, sparse, dense); + return r; +} + +// -------------------------------------------------------------------- +// sspaddmm +// -------------------------------------------------------------------- + +SparseTensor& _sspaddmm_out_cpu( + SparseTensor& r, + const SparseTensor& t, + const SparseTensor& sparse_, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + AT_ASSERT(!t.is_cuda()); // dispatch argument + AT_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + AT_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + AT_CHECK(!dense.is_cuda(), "sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + + AT_CHECK(sparse_._sparseDims() == 2, + "sspaddmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "sspaddmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "sspaddmm: Argument #2: matrices expected, got ", dense.dim(), "D tensor"); + + SparseTensor sparse = sparse_.coalesce(); + + // ixj * jxk = ixk + int64_t dim_i = sparse.size(0); + int64_t dim_j = sparse.size(1); + int64_t dim_k = dense.size(1); + + // NB: This has to occur before the checks, because r may alias t. 
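Both the `addmm` path above and the `sspaddmm` path below first convert the coalesced COO row coordinates into CSR row pointers with `_to_csr`, so that row `h` of the sparse operand is the contiguous slice `[csr[h], csr[h+1])`. The stand-alone sketch below shows that conversion as a count-then-prefix-sum, which is a simpler formulation than the exact loop in `_to_csr`; plain C++, illustrative only.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Build CSR row pointers from the row coordinates of a coalesced (row-sorted)
// COO matrix with `rows` rows. csr[h+1] - csr[h] is the non-zero count of row h.
std::vector<int64_t> coo_rows_to_csr(const std::vector<int64_t>& row_indices, int64_t rows) {
  std::vector<int64_t> csr(rows + 1, 0);
  for (int64_t r : row_indices) {
    csr[r + 1]++;              // count entries per row
  }
  for (int64_t h = 0; h < rows; h++) {
    csr[h + 1] += csr[h];      // prefix sum -> row pointers
  }
  return csr;
}

int main() {
  // Non-zeros in rows 0, 0, 2 of a 4-row matrix.
  auto csr = coo_rows_to_csr({0, 0, 2}, 4);
  assert((csr == std::vector<int64_t>{0, 2, 2, 3, 3}));
  // Row h spans values[csr[h] .. csr[h+1]) -- exactly how the addmm/sspaddmm loops walk it.
  return 0;
}
```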
+ // See test_saddmm + r.sparse_raw_resize_({dim_i, dim_k}, 2, 0); + + AT_CHECK(dense.size(0) == dim_j, + "sspaddmm: Argument #3: Expected dim 0 size ", dim_j, ", got ", dense.size(0)); + AT_CHECK(t.size(0) == dim_i, + "sspaddmm: Argument #1: Expected dim 0 size ", dim_i, ", got ", t.size(0)); + AT_CHECK(t.size(1) == dim_k, + "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); + + int64_t nnz = sparse._nnz(); + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + + LongTensor csr = _to_csr(indices.data(), dim_i, nnz); + + int64_t t_nnz = t._nnz(); + int64_t r_nnz = nnz * dim_k + t_nnz; + LongTensor newi = native::empty({2, r_nnz}, kLong); + LongTensor newv = native::zeros({r_nnz}, values.options()); + + if (t_nnz != 0) { + LongTensor narrowi = newi.narrow(1, 0, t_nnz); + Tensor narrowv = newv.narrow(0, 0, t_nnz); + + narrowi.copy_(t._indices()); + narrowv.copy_(t._values()); + newv.mul_(beta); + } + + // sparse = sparse * dense + int64_t p = t_nnz; + + auto csr_accessor = csr.accessor(); + auto indices_accessor = indices.accessor(); + auto newi_accessor = newi.accessor(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t newv_stride0 = newv.stride(0); + + AT_DISPATCH_ALL_TYPES( + values.type(), "sspmm", [&] { + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data(); + scalar_t* newv_ptr = newv.data(); + scalar_t cast_alpha = alpha.to(); + + for (int64_t h = 0; h < dim_i; h++) { + int64_t i_start = csr_accessor[h]; + int64_t i_end = csr_accessor[h+1]; + for (int64_t i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + int64_t col = indices_accessor[1][i]; + if (col >= 0 && col < dim_j) { + THBlas_axpy(dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, dense_stride1, + newv_ptr + p * newv_stride0, 1); + } else { + AT_ERROR("index out of bound. sspmm: ", col, " not between 1 and ", dim_j); + } + } + // Fill up the indices with the right values + if (i_start != i_end) { + for (int64_t i = 0; i < dim_k; i++) { + newi_accessor[0][p+i] = h; + newi_accessor[1][p+i] = i; + } + p += dim_k; + } + } + } + ); + + // to avoid a clone + _get_sparse_impl(r)->set_indices(newi); + _get_sparse_impl(r)->set_values(newv); + _get_sparse_impl(r)->set_nnz(p); + + return r; +} + +// sparse, sparse, sparse, dense, real, real -> sparse +Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, + const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { + AT_ERROR("tensor.sspaddmm(...) 
can only be called on sparse tensors"); +} + +// sparse, dense -> sparse +Tensor smm(const Tensor& self, const Tensor& mat2) { + auto result = self.type().tensor(); + self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + return result; +} + +// sparse, sparse, dense, real, real -> sparse +Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, + Scalar beta, Scalar alpha) { + auto result = self.type().tensor(); + self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + return result; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h new file mode 100644 index 0000000..226b908 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -0,0 +1,131 @@ +#include +#include + +#include + +namespace at { namespace native { + +// Just for documentary purposes +using SparseTensor = Tensor; +using LongTensor = Tensor; +using IntTensor = Tensor; +using SparseType = Type; + +namespace { + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +SparseTensorImpl* _get_sparse_impl(const SparseTensor& self) { + if (!self.is_sparse()) AT_ERROR("_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Port of the old THCSTensor_(checkGPU), but it doesn't really belong here +// because it is more general +// NB: I dropped kernelP2PEnabled support +// NB: This only works if the tensors are KNOWN to be CUDA. +// TODO: Generalize it so it works on CPU as well +inline bool _check_device(ArrayRef ts) { + if (ts.empty()) { + return true; + } + const Tensor& ref_t = ts.front(); + int64_t curDevice = current_device(); + for (const Tensor& t : ts) { + if (t.get_device() != curDevice) return false; + } + return true; +} + +inline void _raw_resize_sparse(const SparseTensor& self, int64_t sparseDims, int64_t denseDims, IntList size) { + _get_sparse_impl(self)->raw_resize_(sparseDims, denseDims, size); +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. This used to be called THSTensor_(_move) +inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { + _get_sparse_impl(self)->set_indices_and_values(indices, values); +} + +// Take indices and values and makes a (data) copy of them to put into the sparse +// indices/values. 
This used to be called THSTensor_(_set) +inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { + _alias_into_sparse(self, indices.clone(), values.clone()); +} + +// Does NOT make copies of indices/values +inline SparseTensor _new_with_dims_and_tensor_sparse( + const SparseType& dtype, + int64_t sparseDims, + int64_t denseDims, + ArrayRef sizes, + const LongTensor& indices, + const Tensor& values) { + SparseTensor self = new_sparse(dtype); + _raw_resize_sparse(self, sparseDims, denseDims, sizes); + _alias_into_sparse(self, indices, values); + return self; +} + +// TODO: put this into the public API +inline bool isSameTensor(const Tensor& lhs, const Tensor& rhs) { + return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); +} + +inline bool _is_same_density(const SparseTensor& self, const SparseTensor& src) { + return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims(); +} + +// if forceClone is true, the result will forced to be a clone of self. +inline LongTensor _newFlattenedIndices(const SparseTensor& self, bool forceClone) { + LongTensor indices = self._indices(); + int64_t sparseDims = self._sparseDims(); + if (sparseDims == 1) { + if (forceClone) { + return indices.clone(); + } else { + return indices; + } + } else { + // FIXME TH_INDEX_BASE + int64_t factor = 1; + LongTensor indices1D = at::empty({1, self._nnz()}, indices.options()); + indices1D.fill_(TH_INDEX_BASE); + for (int64_t d = sparseDims - 1; d >= 0; d--) { + indices1D.add_(indices.select(0, d), factor); + if (TH_INDEX_BASE != 0) { + indices1D.add_(-TH_INDEX_BASE); + } + factor *= self.size(d); + } + return indices1D; + } +} + +// Give us a new values tensor, with the same dimensionality +// as 'values' but with a new number of non-zero elements. +// TODO: Expose this for real in ATen, some day? +// NB: Doesn't preserve data. +inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { + if (values.numel() == 0) { // values tensor uninitialized + // TODO: This logic looks bogus; if we have an uninitialized + // values tensor, why should we believe that denseDims == 0? + // That's the assumption this code makes. 
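`_newFlattenedIndices` above, like the `indices_scalar` computation in `coalesce_sparse_cpu`, linearizes multi-dimensional COO coordinates into one sortable key per non-zero using row-major arithmetic. The sketch below shows that linearization in plain C++, ignoring the legacy `TH_INDEX_BASE` offset; it is illustrative only, not the ATen code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Flatten sparseDims-dimensional coordinates into scalar keys:
// key = ((i0 * s1 + i1) * s2 + i2) ..., accumulated from the innermost dim out,
// mirroring the factor loop in _newFlattenedIndices and coalesce_sparse_cpu.
std::vector<int64_t> flatten_indices(
    const std::vector<std::vector<int64_t>>& indices,  // [sparseDims][nnz]
    const std::vector<int64_t>& sizes) {                // [sparseDims]
  const int64_t sparseDims = static_cast<int64_t>(indices.size());
  const int64_t nnz = sparseDims ? static_cast<int64_t>(indices[0].size()) : 0;
  std::vector<int64_t> keys(nnz, 0);
  int64_t factor = 1;
  for (int64_t d = sparseDims - 1; d >= 0; d--) {  // innermost dimension varies fastest
    for (int64_t k = 0; k < nnz; k++) {
      keys[k] += indices[d][k] * factor;
    }
    factor *= sizes[d];
  }
  return keys;
}

int main() {
  // Coordinates (0,2) and (1,0) in a 2x3 tensor flatten to keys 2 and 3.
  auto keys = flatten_indices({{0, 1}, {2, 0}}, {2, 3});
  assert((keys == std::vector<int64_t>{2, 3}));
  return 0;
}
```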
+ return values.type().tensor({nnz}); + } else { + std::vector size = values.sizes(); + size[0] = nnz; + return values.type().tensor(size); + } +} + + + +} // anonymous namespace + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh new file mode 100644 index 0000000..44bd3ab --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -0,0 +1,323 @@ +#pragma once + +#include + +namespace at { namespace native { + +namespace apply { + +using at::cuda::detail::TensorInfo; +using indexT = int64_t; + +const int WARP_SIZE = 32; + +template +__device__ void applyOp2( + Op op, IndexType blockSize, + TensorInfo values1, IndexType idx1, + TensorInfo values2, IndexType idx2) { + for (IndexType k = blockIdx.x * blockDim.x + threadIdx.x; + k < blockSize; + k += gridDim.x * blockDim.x) { + op(values1.data + idx1 * blockSize + k, values2.data + idx2 * blockSize + k); + } +} + +template +__device__ void applyOp3( + Op op, IndexType blockSize, + TensorInfo values1, IndexType idx1, + TensorInfo values2, IndexType idx2, + TensorInfo values3, IndexType idx3) { + for (IndexType k = blockIdx.x * blockDim.x + threadIdx.x; + k < blockSize; + k += gridDim.x * blockDim.x) { + op(values1.data + idx1 * blockSize + k, + values2.data + idx2 * blockSize + k, + values3.data + idx3 * blockSize + k); + } +} + +template +__global__ void sparseElementwiseKernel( + Op op, + TensorInfo dense, + TensorInfo indices, + TensorInfo values, + const IndexType nnz) { + IndexType indskip = indices.strides[0]; + IndexType valueSize = values.strides[0]; + for (IndexType linearId = blockIdx.x; + linearId < nnz; + linearId += gridDim.x) { + IndexType index = 0; + for (IndexType d = 0; d < indices.sizes[0]; d++) { + index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + } + Real *dst = dense.data + index * valueSize; + Real *src = values.data + linearId * valueSize; + for (IndexType linearId2 = threadIdx.x; linearId2 < valueSize; linearId2 += blockDim.x) { + op(dst + linearId2, src + linearId2); + } + } +} + +template +__global__ void sparseElementwiseKernelScalar( + Op op, + TensorInfo dense, + TensorInfo indices, + TensorInfo values, + const IndexType nnz) { + IndexType indskip = indices.strides[0]; + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < nnz; + linearId += gridDim.x * blockDim.x) { + IndexType index = 0; + for (IndexType d = 0; d < indices.sizes[0]; d++) { + index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + } + op(dense.data + index, values.data + linearId); + } +} + +template +__global__ void valueSparseUnionKernel( + OpBoth opBoth, + OpLeft opLeft, + OpRight opRight, + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + TensorInfo r_values, + TensorInfo t_values, + TensorInfo s_values, + const IndexType t_nnz, const IndexType s_nnz) { + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t cmp, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType valueSize = r_values.strides[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + cmp = 1; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) 
{ + cmp = -1; + break; + } + } + } + if (cmp == 0) applyOp3(opBoth, valueSize, r_values, r_i, t_values, t_i++, s_values, s_i++); + else if (cmp > 0) applyOp2(opLeft, valueSize, r_values, r_i, t_values, t_i++); + else if (cmp < 0) applyOp2(opRight, valueSize, r_values, r_i, s_values, s_i++); + r_i++; + } +} + +// TODO find a way to parallelize this... +template +__global__ void indexSparseUnionKernel( + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) { + IndexType r_indskip = r_indices.strides[0]; + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t cmp, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz || s_i < s_nnz) { + if (t_i >= t_nnz) { + cmp = -1; + } else if (s_i >= s_nnz) { + cmp = 1; + } else { + cmp = 0; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + cmp = 1; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + cmp = -1; + break; + } + } + } + if (cmp >= 0) { + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i]; + } + t_i++; + } + if (cmp <= 0) { + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = s_indices.data[d * s_indskip + s_i]; + } + s_i++; + } + r_i++; + } + *resultNnz = r_i; +} + +template +__global__ void valueSparseIntersectionKernel( + Op op, + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + TensorInfo r_values, + TensorInfo t_values, + TensorInfo s_values, + const IndexType t_nnz, const IndexType s_nnz) { + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t match, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType valueSize = r_values.strides[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz && s_i < s_nnz) { + match = 1; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) continue; + applyOp3(op, valueSize, r_values, r_i++, t_values, t_i++, s_values, s_i++); + } +} + +// TODO find a way to parallelize this... 
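The union and intersection kernels in this header share the same sequential core: walk two lexicographically sorted index lists with two cursors, advance whichever side has the smaller coordinate tuple, and emit an entry when all `nDimI` coordinates agree. The host-side sketch below shows the intersection variant in plain C++; the types and names are made up for illustration, while the real kernels operate on `TensorInfo` buffers on the GPU.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Both inputs are assumed coalesced, i.e. their coordinate tuples are
// lexicographically sorted, so a two-pointer scan finds all matches in
// O(t_nnz + s_nnz) comparisons.
struct Coo {
  std::vector<std::vector<int64_t>> indices;  // [nDimI][nnz]
  int64_t nnz() const { return indices.empty() ? 0 : static_cast<int64_t>(indices[0].size()); }
};

std::vector<std::pair<int64_t, int64_t>> intersect_sorted(const Coo& t, const Coo& s) {
  const int64_t nDimI = static_cast<int64_t>(t.indices.size());
  std::vector<std::pair<int64_t, int64_t>> matches;  // (t_i, s_i) pairs with equal coordinates
  int64_t t_i = 0, s_i = 0;
  while (t_i < t.nnz() && s_i < s.nnz()) {
    bool match = true;
    for (int64_t d = 0; d < nDimI; d++) {
      if (t.indices[d][t_i] < s.indices[d][s_i]) { t_i++; match = false; break; }
      if (t.indices[d][t_i] > s.indices[d][s_i]) { s_i++; match = false; break; }
    }
    if (!match) continue;
    matches.emplace_back(t_i++, s_i++);
  }
  return matches;
}

int main() {
  // 2-D coordinates; both lists are sorted lexicographically.
  Coo t{{{0, 0, 1}, {1, 2, 0}}};  // (0,1), (0,2), (1,0)
  Coo s{{{0, 1, 1}, {2, 0, 3}}};  // (0,2), (1,0), (1,3)
  auto m = intersect_sorted(t, s);
  assert(m.size() == 2);          // matches at (0,2) and (1,0)
  return 0;
}
```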
+template +__global__ void indexSparseIntersectionKernel( + TensorInfo r_indices, + TensorInfo t_indices, + TensorInfo s_indices, + const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) { + IndexType r_indskip = r_indices.strides[0]; + IndexType t_indskip = t_indices.strides[0]; + IndexType s_indskip = s_indices.strides[0]; + int64_t match, d; + int64_t nDimI = r_indices.sizes[0]; + IndexType r_i = 0, t_i = 0, s_i = 0; + while (t_i < t_nnz && s_i < s_nnz) { + match = 1; + for (d = 0; d < nDimI; d++) { + if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) { + t_i++; + match = 0; + break; + } + if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) { + s_i++; + match = 0; + break; + } + } + if (!match) continue; + for (d = 0; d < nDimI; d++) { + r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i]; + } + r_i++; t_i++; s_i++; + } + *resultNnz = r_i; +} + +// template +// __global__ void coalesceValuesKernel_gridStrided( +// long *segment_offsets, long *value_indices, +// Dtype *values, Dtype *newValues, +// long nnz, long newNnz, long stride) { +// +// long chunksPerSeg = THCCeilDiv(stride, (long) blockDim.x); +// long numChunks = newNnz * chunksPerSeg; +// long chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; +// long chunkStride = gridDim.x * blockDim.y; +// +// for (long chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { +// long featureDim = (chunk % chunksPerSeg) * blockDim.x + threadIdx.x; +// if (featureDim < stride) { +// auto valFeat = values + featureDim; +// long seg = chunk / chunksPerSeg; +// auto begin = segment_offsets[seg]; +// auto end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz; +// Acctype valSum = static_cast::to(0); +// for (long valIdx = begin; valIdx < end; valIdx++) { +// const long valRow = value_indices[valIdx] * stride; +// valSum += static_cast::to(valFeat[valRow]); +// } +// newValues[seg * stride + featureDim] = static_cast::to(valSum); +// } +// } +// } + +template +__global__ void coalesceValuesKernel( + int64_t *segment_offsets, int64_t *value_indices, + Dtype *values, Dtype *newValues, + int64_t nnz, int64_t newNnz, int64_t stride) { + + int seg = blockIdx.x * 4 + threadIdx.y; + + // Number of values processed by each thread (grain size) + const int SZ = 4; + + if (seg < newNnz) { + const int newValueRow = seg * stride; + const int begin = segment_offsets[seg]; + const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + Acctype tmp[SZ]; + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + tmp[ii] = 0; + } + for (int row = begin; row < end; row++) { + const int valueRow = ((int) value_indices[row]) * stride; + + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + tmp[ii] += static_cast(values[valueRow + featureDim]); + } + } + } + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + newValues[newValueRow + featureDim] = static_cast(tmp[ii]); + } + } + } +} + +} // namespace apply + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu new file mode 100644 index 0000000..0ed53be --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -0,0 +1,228 @@ +#include +#include +#include + +#include + +#include + +namespace at { namespace native { namespace sparse { namespace cuda { + +#ifndef __HIP_PLATFORM_HCC__ + +std::string cusparseGetErrorString(cusparseStatus_t status) { + switch(status) + { + case CUSPARSE_STATUS_SUCCESS: + return "success"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "library not initialized"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "resource allocation failed"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "an invalid numeric value was used as an argument"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "an absent device architectural feature is required"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "an access to GPU memory space failed"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "the GPU program failed to execute"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "an internal operation failed"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "the matrix type is not supported by this function"; + + case CUSPARSE_STATUS_ZERO_PIVOT: + return "an entry of the matrix is either structural zero or numerical zero (singular block)"; + + default: + { + std::ostringstream oss; + oss << "unknown error " << static_cast(status); + return oss.str(); + } + } +} + +inline void CUSPARSE_CHECK(cusparseStatus_t status) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + AT_ERROR("cusparse runtime error: ", cusparseGetErrorString(status)); + } +} + +inline cusparseHandle_t setCUDASparseStream() { + cusparseHandle_t handle = globalContext().getCurrentCUDASparseHandle(); + cusparseSetStream(handle, globalContext().getCurrentCUDAStream()); + return handle; +} + +void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr) { + AT_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), + "cusparseXcoo2csr only supports m, nnz with the bound [val] <= ", + INT_MAX); + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, coorowind, nnz, m, csrrowptr, + TH_INDEX_BASE ? 
CUSPARSE_INDEX_BASE_ONE : CUSPARSE_INDEX_BASE_ZERO + )); +} + +cusparseOperation_t convertTransToCusparseOperation(char trans) { + if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE; + else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE; + else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + else { + AT_ERROR("trans must be one of: t, n, c"); + } +} + +void adjustLd(char transb, int64_t m, int64_t n, int64_t k, int64_t *ldb, int64_t *ldc) +{ + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + *ldc = m; + + if(transb_) + { + if(k == 1) + *ldb = n; + } + else + { + if(n == 1) + *ldb = k; + } +} + +/* Level 3 */ +void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) +{ + adjustLd(transb, m, n, k, &ldb, &ldc); + cusparseOperation_t opa = convertTransToCusparseOperation(transa); + cusparseOperation_t opb = convertTransToCusparseOperation(transb); + + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + "cusparseScsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_nnz = (int)nnz; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseScsrmm2(handle, opa, opb, i_m, i_n, i_k, i_nnz, &alpha, desc, csrvala, csrrowptra, csrcolinda, b, i_ldb, &beta, c, i_ldc)); +} + +void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) +{ + adjustLd(transb, m, n, k, &ldb, &ldc); + cusparseOperation_t opa = convertTransToCusparseOperation(transa); + cusparseOperation_t opb = convertTransToCusparseOperation(transb); + + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + "cusparseDcsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_nnz = (int)nnz; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseDcsrmm2(handle, opa, opb, i_m, i_n, i_k, i_nnz, &alpha, desc, csrvala, csrrowptra, csrcolinda, b, i_ldb, &beta, c, i_ldc)); + // TODO: I think this leaks the matrix descriptor. 
Proper fix is to create + // real descriptor classes +} + +/* format conversion */ +void CreateIdentityPermutation(int64_t nnz, int *P) { + AT_CHECK((nnz <= INT_MAX), + "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + cusparseCreateIdentityPermutation(handle, i_nnz, P); +} + +void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <=", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcsrsort_bufferSizeExt(handle, i_m, i_n, i_nnz, csrRowPtr, csrColInd, pBufferSizeInBytes)); +} + +void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcsrsort only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + cusparseMatDescr_t desc; + cusparseCreateMatDescr(&desc); +#if TH_INDEX_BASE == 1 + cusparseSetMatIndexBase(&desc, CUSPARSE_INDEX_BASE_ONE); +#endif + CUSPARSE_CHECK(cusparseXcsrsort(handle, i_m, i_n, i_nnz, desc, csrRowPtr, csrColInd, P, pBuffer)); + // TODO: I think this leaks the matrix descriptor. +} + +void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "Xcoosort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, i_m, i_n, i_nnz, cooRows, cooCols, pBufferSizeInBytes)); +} + +void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer) +{ + AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + "XcoosortByRow only supports m, n, nnz with the bound [val] <= ", + INT_MAX); + int i_m = (int)m; + int i_n = (int)n; + int i_nnz = (int)nnz; + + auto handle = setCUDASparseStream(); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, i_m, i_n, i_nnz, cooRows, cooCols, P, pBuffer)); +} + +#endif + +}}}} // namespace at::native::sparse::cuda diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh new file mode 100644 index 0000000..ed800fc --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace at { namespace native { namespace sparse { namespace cuda { + +AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr); + +/* Level 3 */ +AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); +AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); + +// overloaded version +inline void csrmm2(char 
transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { Scsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); } +inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { Dcsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); } + +/* format conversion */ +AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P); +AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes); +AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer); +AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes); +AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer); + +}}}} // namespace at::native::sparse::cuda diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp new file mode 100644 index 0000000..68ab33a --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -0,0 +1,61 @@ +#include +#include + +#include + +namespace at { namespace native { + +SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { + AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + t.sizes(), " but mask has size ", mask.sizes()); + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); + AT_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(_check_device({r, t, mask}), + "sparse_mask: arguments are located on different devices; self is on device ", t.get_device(), + ", mask is on device ", mask.get_device(), ", out is on device ", r.get_device()); + resize_as_sparse_(r, mask); + if (mask._nnz() == 0) { + return r.zero_(); + } + LongTensor mask_indices = mask._indices(); + Tensor mask_values = mask._values(); + Tensor r_values = r._values().type().tensor(mask_values.sizes()); + _alias_into_sparse(r, mask_indices.clone(), r_values); + _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); + _get_sparse_impl(r)->set_nnz(mask._nnz()); + + LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); + + for (int64_t d = 0; d < mask._sparseDims(); d++) { + indices.mul_(mask.size(d)); + // This used to use a buffer but I deoptimized it + indices.add_(mask_indices.select(0, d)); + } + + std::vector view_size(1 + mask._denseDims()); + view_size[0] = -1; + for (int64_t d = 0; d < mask._denseDims(); d++) { + view_size[d + 1] = mask.size(mask._sparseDims() + d); + } + + Tensor t_view = t.view(view_size); + // TODO: Re-audit this; it used to be an indexSelect directly into r_values + at::index_select_out(r_values, t_view, 0, indices); + + return r; +} + +SparseTensor sparse_mask_cuda(const Tensor& t, SparseTensorRef mask) { + SparseTensor r = t.type().toSparse().tensor(); + sparse_mask_out_cuda(r, t, 
mask.tref); + return r; +} + +// Technically, this is not actually CUDA specific +int64_t get_device_sparse_cuda(const Tensor& self) { + return self._values().get_device(); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu new file mode 100644 index 0000000..a12edc9 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +namespace at { namespace native { + +SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { +#ifndef __HIP_PLATFORM_HCC__ + int64_t nnz = self._nnz(); + if (nnz < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } + if (self.is_coalesced()) { + return self; + } + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + // Replace instances with + + // For indices, a simple sort + unique suffices + // For values, we use a custom kernel for segmented reduction (can't use Thrust due to indirection). + + // TODO: I'm not sure if this could ever be non-contiguous + LongTensor values = self._values().contiguous(); + + int64_t sparseDims = self._sparseDims(); + int64_t stride = values.stride(0); + + // indices will be modified by Thrust, so we have to clone or use new storage + // here. + LongTensor indices1D = _newFlattenedIndices(self, true); + + LongTensor origIndices = at::empty({nnz}, self._indices().options()); + LongTensor uniqueOffsets = at::empty({nnz}, self._indices().options()); + + typedef thrust::device_ptr thrust_ptr; + thrust_ptr indicesIter(indices1D.data()); + thrust_ptr origIndicesIter(origIndices.data()); + thrust_ptr uniqueOffsetsIter(uniqueOffsets.data()); + + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIterI(TH_INDEX_BASE); + thrust::counting_iterator countIterO(TH_INDEX_BASE); + + thrust::copy(policy, countIterI, countIterI + nnz, origIndicesIter); + thrust::copy(policy, countIterO, countIterO + nnz, uniqueOffsetsIter); + + thrust::sort_by_key(policy, + indicesIter, indicesIter + nnz, + origIndicesIter, ThrustLTOp() + ); + + // this forces device-host synchronization! 
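For reference, the flatten-sort-unique-sum flow used by this coalesce path can be sketched on the CPU in plain Python. This is only an illustrative sketch with made-up names, not the actual Thrust/CUDA kernel:

```python
# CPU sketch of coalescing a COO tensor: flatten indices, sort them while
# carrying the original positions, then sum values that share a key.
def coalesce_coo(indices, values, sizes):
    # indices: one list per sparse dim; values: one scalar per entry.
    sparse_dims, nnz = len(indices), len(values)

    # Flatten multi-dimensional indices into a single row-major key.
    flat = [0] * nnz
    for d in range(sparse_dims):
        for i in range(nnz):
            flat[i] = flat[i] * sizes[d] + indices[d][i]

    # sort_by_key: order the keys, remembering where each entry came from.
    order = sorted(range(nnz), key=lambda i: flat[i])

    # unique_by_key + segmented sum: duplicates collapse, their values add up.
    out_keys, out_vals = [], []
    for i in order:
        if out_keys and out_keys[-1] == flat[i]:
            out_vals[-1] += values[i]
        else:
            out_keys.append(flat[i])
            out_vals.append(values[i])

    # Unflatten the surviving keys back into per-dimension indices.
    out_indices = [[0] * len(out_keys) for _ in range(sparse_dims)]
    for j, key in enumerate(out_keys):
        for d in reversed(range(sparse_dims)):
            out_indices[d][j] = key % sizes[d]
            key //= sizes[d]
    return out_indices, out_vals

# A 3x3 tensor with a duplicate entry at (1, 2): values 2.0 and 3.0 merge.
print(coalesce_coo([[0, 1, 1], [0, 2, 2]], [1.0, 2.0, 3.0], [3, 3]))
# -> ([[0, 1], [0, 2]], [1.0, 5.0])
```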
+ thrust::pair newEnd = thrust::unique_by_key(policy, + indicesIter, indicesIter + nnz, + uniqueOffsetsIter + ); + int64_t newNnz = newEnd.first - indicesIter; + + indices1D.resize_({1, newNnz}); + std::vector newValues_size(values.sizes()); + newValues_size[0] = newNnz; + Tensor newValues = at::empty(newValues_size, values.options()); + + dim3 grid(THCCeilDiv(newNnz, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "coalesce_sparse_cuda", [&] { + using accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data(), + origIndices.data(), + values.data(), + newValues.data(), + nnz, + newNnz, + stride + ); + }); + +// this grid-strided version is slower but probably more flexible + // to different sizes + // int64_t blockX = min(stride, (int64_t) 512); + // dim3 block(blockX, 512 / blockX); + // int64_t grid = min((int64_t) 1024, THCCeilDiv((int64_t) newNnz * stride, (int64_t) block.x * block.y)); + // THCSTensor_coalesceValuesKernel_gridStrided<<>>( + // THCIndexTensor_(data)(state, uniqueOffsets), + // THCIndexTensor_(data)(state, origIndices), + // THCTensor_(data)(state, values), + // THCTensor_(data)(state, newValues), + // nnz, + // newNnz, + // stride + // ); + + //////////////////////////////////////////////////////////// + // unflatten indices if necessary + LongTensor newIndices; + if (sparseDims == 1) { + newIndices = indices1D; + } else { + newIndices = at::empty({sparseDims, newNnz}, origIndices.options()); + if (TH_INDEX_BASE != 0) { + indices1D.add_(-1); + } + for (int64_t d = sparseDims - 1; d >= 0; d--) { + // NB: Not a select, so I can preserve the outer dimension + LongTensor indicesSlice = newIndices.narrow(0, d, 1); + // Note for the porting guide: THCTensor_(copy) does NOT do normal + // broadcasting logic; instead, it will blast the elements from one + // to the other so long as the numel is the same + indicesSlice.copy_(indices1D); + indices1D.div_(self.size(d)); + indicesSlice.add_(indices1D, -self.size(d)); + } + if (TH_INDEX_BASE != 0) { + indices1D.add_(1); // "lol" + } + } + //////////////////////////////////////////////////////////// + + SparseTensor dst = ::at::native::sparse_coo_tensor(newIndices, newValues, self.sizes()); + _get_sparse_impl(dst)->set_coalesced(true); + + THCudaCheck(cudaGetLastError()); + return dst; +#else + AT_ERROR("coalesce_sparse_cuda: HIP not supported"); +#endif +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu new file mode 100644 index 0000000..3521fc3 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -0,0 +1,530 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define I_INFO(tensor) cuda::detail::getTensorInfo(tensor) +#define V_INFO(tensor) cuda::detail::getTensorInfo(tensor) + +namespace at { namespace native { + +// -------------------------------------------------------------------- +// Utility functions +// -------------------------------------------------------------------- + +#ifndef __HIP_PLATFORM_HCC__ +namespace { + IntTensor _to_csr_int(const LongTensor& rowIndices, int64_t dim, int64_t nnz) { + IntTensor csr = at::empty({dim+1}, CUDA(kInt)); + IntTensor rowIndicesInt = at::empty({rowIndices.size(0)}, CUDA(kInt)); + rowIndicesInt.copy_(rowIndices); + sparse::cuda::Xcoo2csr(rowIndicesInt.data(), nnz, dim, 
csr.data()); + return csr; + } +} +#endif + +// NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not +// wired at all) + +// -------------------------------------------------------------------- +// addmm(Tensor, SparseTensorRef, Tensor, Scalar, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, Scalar beta, Scalar alpha) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); + AT_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({sparse_, r_, t, dense})); + + // TODO: This error message seems awfully opaque + AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + + // mxk * kxn = mxn + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(t.size(0) == m, + "addmm: Argument #1 (t): Expected dim 0 size ", m, ", got ", t.size(0)); + AT_CHECK(t.size(1) == n, + "addmm: Argument #1 (t): Expected dim 1 size ", n, ", got ", t.size(1)); + AT_CHECK(dense.size(0) == k, + "addmm: Argument #3 (dense): Expected dim 0 size ", k, ", got ", dense.size(0)); + + r_.resize_({m, n}); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + + LongTensor rowIndices = indices.select(0, 0); + LongTensor colIndices = indices.select(0, 1); + IntTensor csr = _to_csr_int(rowIndices, m, nnz); + IntTensor colIndicesInt = at::empty({colIndices.size(0)}, indices.type().toScalarType(kInt)); + colIndicesInt.copy_(colIndices); + + // No half support, so we don't have to use CUDATypeConversion + Tensor r__; + AT_DISPATCH_FLOATING_TYPES( + values.type(), "addmm_sparse_cuda", [&] { + scalar_t cast_beta = beta.to(); + scalar_t cast_alpha = alpha.to(); + if (cast_beta == 0) { + r_.zero_(); + } else if (cast_beta == 1) { + if (!isSameTensor(t, r_)) { + r_.copy_(t); + } + } else { + at::mul_out(r_, t, beta); + } + + /* r_ */ + if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) { + r__ = r_; + } else { + // TODO: how... strange + r__ = r_.transpose(0, 1).clone(); + r__.transpose_(0, 1); + } + + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? 
dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + + }); + + r_.copy_(r__); + return r_; +#else + AT_ERROR("s_addmm_out_sparse_dense_cuda: HIP not supported"); +#endif +} + +Tensor s_addmm_sparse_dense_cuda( + const Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + Tensor r = t.type().tensor(); + s_addmm_out_sparse_dense_cuda(r, t, sparse, dense, beta, alpha); + return r; +} + +Tensor& s_addmm_sparse_dense_cuda_( + Tensor& t, + const SparseTensor& sparse, + const Tensor& dense, + Scalar beta, + Scalar alpha +) { + return s_addmm_out_sparse_dense_cuda(t, t, sparse, dense, beta, alpha); +} + +// Deleted sspaddmm (sparse, dense) -> sparse + +// -------------------------------------------------------------------- +// hspmm(SparseTensor mat1, Tensor mat2) +// -------------------------------------------------------------------- + +SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse_, const Tensor& dense/* , Scalar alpha */) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(sparse_.is_cuda()); // dispatch argument + AT_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); + AT_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, sparse_, dense})); + + AT_CHECK(sparse_._sparseDims() == 2, + "hspmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); + AT_CHECK(sparse_._denseDims() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.dim() == 2, + "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); + + int64_t m = sparse_.size(0); + int64_t k = sparse_.size(1); + int64_t n = dense.size(1); + + AT_CHECK(dense.size(0) == k, + "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); + + _get_sparse_impl(r_)->raw_resize_(1, 1, {m, n}); + + cudaStream_t stream = globalContext().getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + SparseTensor sparse = sparse_.coalesce(); + + int64_t nnz = sparse._nnz(); + + LongTensor indices = at::empty({1, nnz}, CUDA(kLong)); + // create values in column-major format to avoid copying in spaddmm + Tensor values = at::empty({n, nnz}, dense.type()); + values.transpose_(0, 1); + + // why does sparse need to be cloned? If this is really necessary maybe we + // need to fuse this with newCoalesce + SparseTensor newSparse = sparse.clone(); + LongTensor spIndices = newSparse._indices(); + LongTensor dstIndices = spIndices.select(0, 0); + // Save destination indices to output hybrid tensor + indices.copy_(dstIndices); + // Replace destination indices with 0, 1, 2, 3, ... 
and compute output values + // tensor with sparse * dense multiplication + thrust::device_ptr indicesIter(dstIndices.data()); + thrust::sequence(policy, indicesIter, indicesIter + nnz); + _get_sparse_impl(newSparse)->_sizes_mut()[0] = nnz; // TODO: use something safer) + s_addmm_out_sparse_dense_cuda(values, values, newSparse, dense, 0, /*alpha*/ 1); + _get_sparse_impl(r_)->set_indices_and_values(indices, values); + + return r_; +#else + AT_ERROR("hspmm_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor hspmm_sparse_cuda(const SparseTensor& sparse, const Tensor& dense) { + SparseTensor r = sparse.type().tensor(); + hspmm_out_sparse_cuda(r, sparse, dense); + return r; +} + +// -------------------------------------------------------------------- +// add(Tensor, SparseTensorRef, Scalar) +// formerly known as spcadd +// -------------------------------------------------------------------- + +Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorRef sparse_, at::Scalar value) { +#ifndef __HIP_PLATFORM_HCC__ + const SparseTensor& sparse = sparse_.tref; + + AT_ASSERT(dense.is_cuda()); // dispatch argument + AT_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({sparse, r_, dense})); + + AT_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + r_.resize_as_(dense); + r_.copy_(dense); + return r_; + } + + Tensor r = r_; + if (!isSameTensor(r, dense)) { + r_.resize_as_(dense); + r_.copy_(dense); + } else { + AT_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); + r = r_.contiguous(); + } + + LongTensor indices = sparse._indices(); + Tensor values = sparse._values(); + int64_t nDim = dense.dim(); + int64_t nDimI = sparse._sparseDims(); + + if (sparse.is_coalesced()) { + // TODO benchmark to decide whether to remove this special case + const dim3 block = cuda::getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + if (sparse._denseDims() == 0) { + AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r_), I_INFO(indices), V_INFO(values), + static_cast(nnz)); + }); + } else { + AT_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + apply::sparseElementwiseKernel, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r_), I_INFO(indices), V_INFO(values), + static_cast(nnz)); + }); + } + } else { + LongTensor indices1D = _newFlattenedIndices(sparse, 0).squeeze_(0).narrow(0, 0, nnz); + + // FIXME: at some point we can wrap the scale into indexAdd + // NB: Purposely not inplace! 
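The uncoalesced branch that follows flattens each sparse index over the sparse dimensions and scatter-adds the scaled values into a flattened view of the output; duplicates simply accumulate, which is why no coalesce is needed first. A rough pure-Python sketch of that idea (illustrative names, scalar-valued entries assumed):

```python
# Sketch of r = dense + alpha * sparse on the uncoalesced path: flatten each
# sparse index and scatter-add the scaled value into the row-major dense data.
def add_dense_sparse(dense_flat, sizes, indices, values, alpha):
    r = list(dense_flat)                 # copy of the row-major dense input
    sparse_dims = len(indices)
    for k in range(len(values)):
        flat = 0
        for d in range(sparse_dims):     # flatten over the sparse dims only
            flat = flat * sizes[d] + indices[d][k]
        r[flat] += alpha * values[k]     # index_add_-style accumulation
    return r

dense = [10.0] * 6                       # a 2x3 matrix, row-major
print(add_dense_sparse(dense, [2, 3], [[0, 1], [1, 2]], [1.0, 2.0], alpha=0.5))
# -> [10.0, 10.5, 10.0, 10.0, 10.0, 11.0]
```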
+ AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "add_out_dense_sparse_cuda", [&] { + if (value.to() != static_cast(1)) { + values = values.mul(value); + } + }); + + int64_t view_rows = 1; + int64_t view_columns = 1; + for (int i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int i = nDimI; i < nDim; i++) { + view_columns *= r.size(i); + } + + Tensor r_view = r.view({view_rows, view_columns}); + values = values.narrow(0, 0, nnz).reshape({nnz, view_columns}); + r_view.index_add_(0, indices1D, values); + } + THCudaCheck(cudaGetLastError()); + + return r_; +#else + AT_ERROR("add_out_dense_sparse_cuda: HIP not supported"); +#endif +} + +Tensor add_dense_sparse_cuda(const Tensor& t, SparseTensorRef src, Scalar alpha) { + Tensor r = t.type().tensor(); + add_out_dense_sparse_cuda(r, t, src, alpha); + return r; +} + +Tensor& add_dense_sparse_cuda_(Tensor& t, SparseTensorRef src, Scalar alpha) { + return add_out_dense_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// add(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, t, src})); + AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); + + if (src._nnz() == 0) { + return raw_copy_sparse_(r_, t); + } + if (t._nnz() == 0) { + return mul_out_sparse_scalar(r_, src, value); + } + + AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + + // We deliberately choose to simply concat the indices and values tensors + // rather than merging them. This removes the need to synchronously fetch nnz + // at the end of the operation, at the cost of having a non-coalesced result. + // This trade-off is preferable for the common use-case of gradient accumulation. + LongTensor t_indices_ = t._indices(); + Tensor t_values_ = t._values(); + LongTensor s_indices_ = src._indices(); + Tensor s_values_ = src._values(); + + AT_DISPATCH_ALL_TYPES_AND_HALF( + s_values_.type(), "s_add_out_sparse_cuda", [&] { + if (value.to() != static_cast(1)) { + s_values_ = s_values_.mul(value); + } + }); + + LongTensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_}, 0); + r_.resize_as_(src); + _alias_into_sparse(r_, r_indices_, r_values_); + + // FIXME: add some heuristic about when to call coalesce() here, so that + // tensors don't totally blow up in size by concatenation; e.g. 
+ // r->minUnique = max(a->minUnique + b->minUnique); + // if (r->nnz / r->minUnique > COMPACTION_THRESHOLD) { + // THCSTensor_(contiguous)(r); + // r->minUnique = r->nnz; + // } + + return r_; +#else + AT_ERROR("s_add_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor s_add_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_add_out_sparse_cuda(r, t, src, alpha); + return r; +} + +SparseTensor& s_add_sparse_cuda_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_add_out_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// sub(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_sub_out_sparse_cuda(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { + AT_ASSERT(t.is_cuda()); // dispatch argument + AT_CHECK(src.is_cuda(), "sub: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r.is_cuda(), "sub: expected 'out' to be CUDA, but got CPU"); + + AT_DISPATCH_ALL_TYPES( + t.type(), "sub_sparse", [&] { + scalar_t cast_value = value.to(); + s_add_out_sparse_cuda(r, t, src, -cast_value); + } + ); + return r; +} + +SparseTensor s_sub_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar alpha) { + SparseTensor r = t.type().tensor(); + s_sub_out_sparse_cuda(r, t, src, alpha); + return r; +} + +SparseTensor& s_sub_sparse_cuda_(SparseTensor& t, const SparseTensor& src, Scalar alpha) { + return s_sub_out_sparse_cuda(t, t, src, alpha); +} + +// -------------------------------------------------------------------- +// mul(SparseTensor, SparseTensor, Scalar) [broadcasts] +// -------------------------------------------------------------------- + +SparseTensor& s_mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, const SparseTensor& src_) { +#ifndef __HIP_PLATFORM_HCC__ + AT_ASSERT(t_.is_cuda()); // dispatch argument + AT_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); + AT_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); + + AT_CHECK(_check_device({r_, t_, src_})); + AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); + + SparseTensor t = t_.coalesce(); + SparseTensor src = src_.coalesce(); + + if (src_._nnz() == 0 || t_._nnz() == 0) { + return r_.zero_(); + } + + // saving those because they can be overwritten when doing in-place operations + int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); + int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped + int64_t sparseDims = src._sparseDims(); + LongTensor t_indices_ = t._indices(); + Tensor t_values_ = t._values(); + LongTensor s_indices_ = src._indices(); + Tensor s_values_ = src._values(); + LongTensor r_indices_ = t_indices_.type().tensor({sparseDims, max_nnz}); + Tensor r_values_ = _new_values_with_size_of(t_values_, max_nnz).zero_(); + r_.resize_as_(src); + _get_sparse_impl(r_)->set_indices_and_values(r_indices_, r_values_); // TODO: sigh + + int64_t valueSize = t_values_.stride(0); + const dim3 block = dim3(std::min(static_cast(cuda::getApplyBlock().x), valueSize)); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large 
or too many dimensions"); + + LongTensor resultNnz = at::empty({1}, CUDA(kLong)); + AT_DISPATCH_ALL_TYPES_AND_HALF( + t_values_.type(), "s_mul_out_sparse_cuda", [&] { + apply::valueSparseIntersectionKernel, uint64_t, scalar_t> + <<>>( + TensorMulOp(), + I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), + V_INFO(r_values_), V_INFO(t_values_), V_INFO(s_values_), + static_cast(t_nnz), static_cast(s_nnz)); + THCudaCheck(cudaGetLastError()); + + apply::indexSparseIntersectionKernel + <<<1, 1, 0, stream>>>( + I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), + // reinterpret_cast shenanigans, because we don't actually have + // unsigned tensors... + static_cast(t_nnz), static_cast(s_nnz), reinterpret_cast(resultNnz.data_ptr())); + THCudaCheck(cudaGetLastError()); + }); + + // sync! (surely there is a more idiomatic way to do this...) + LongTensor cpu_resultNnz = at::empty({1}, CPU(kLong)); + cpu_resultNnz.copy_(resultNnz); + _get_sparse_impl(r_)->set_nnz(cpu_resultNnz.accessor()[0]); + _get_sparse_impl(r_)->set_coalesced(true); + + return r_; +#else + AT_ERROR("s_mul_out_sparse_cuda: HIP not supported"); +#endif +} + +SparseTensor s_mul_sparse_cuda(const SparseTensor& t, const SparseTensor& src) { + SparseTensor r = t.type().tensor(); + s_mul_out_sparse_cuda(r, t, src); + return r; +} + +SparseTensor& s_mul_sparse_cuda_(SparseTensor& t, const SparseTensor& src) { + return s_mul_out_sparse_cuda(t, t, src); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h new file mode 100644 index 0000000..3b42b61 --- /dev/null +++ b/aten/src/ATen/native/utils/ParamsHash.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < (int)sizeof(Params); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + + +}} // at::native diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py new file mode 100644 index 0000000..13d852d --- /dev/null +++ b/aten/src/ATen/native_parse.py @@ -0,0 +1,147 @@ +from __future__ import print_function +import re +import yaml +import pprint +import sys + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +def parse_default(s): + if s.lower() == 'true': + return True + elif s.lower() == 'false': + return False + elif s == 'nullptr': + return s + elif s == '{}': + return '{}' + elif s == 'nullopt': + return s + try: + return int(s) + except Exception: + try: + return float(s) + except Exception: + return s + + +def sanitize_types(typ): + # split tuples into constituent list + if typ[0] == '(' and 
typ[-1] == ')': + return [x.strip() for x in typ[1:-1].split(',')] + elif typ == 'Generator*': + return ['Generator *'] + return [typ] + + +def parse_arguments(args, func_decl, func_name, func_return): + arguments = [] + python_default_inits = func_decl.get('python_default_init', {}) + is_out_fn = func_name.endswith('_out') + if is_out_fn and func_decl.get('variants', []) not in ['function', ['function']]: + raise RuntimeError("Native functions suffixed with _out MUST be declared with only the function variant; " + "e.g., variants: function; otherwise you will tickle a Python argument binding bug " + "(which usually manifests itself as the result variable being undefined.) " + "The culprit was: {}".format(func_name)) + kwarg_only = False + + if len(args.strip()) == 0: + return arguments + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg_idx, arg in enumerate(args.split(', ')): + type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] + if type_and_name == ['*']: + assert not kwarg_only + kwarg_only = True + continue + + t, name = type_and_name + default = None + python_default_init = None + + if '=' in name: + ns = name.split('=', 1) + name, default = ns[0], parse_default(ns[1]) + + if name in python_default_inits: + assert default is None + python_default_init = python_default_inits[name] + + typ = sanitize_types(t) + assert len(typ) == 1 + argument_dict = {'type': typ[0].rstrip('?'), 'name': name, 'is_nullable': typ[0].endswith('?')} + match = re.match(r'IntList\[(\d+)\]', argument_dict['type']) + if match: + argument_dict['type'] = 'IntList' + argument_dict['size'] = int(match.group(1)) + if default is not None: + argument_dict['default'] = default + if python_default_init is not None: + argument_dict['python_default_init'] = python_default_init + # TODO: convention is that the ith-argument correspond to the i-th return, but it would + # be better if we just named everything and matched by name. 
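For intuition about what this argument parser emits, here is a hand-traced sketch for the hypothetical declaration `norm(Tensor self, Scalar p=2, *, bool keepdim=false)` (not taken from a real YAML file; the exact dict layout may differ slightly):

```python
# Hand-traced sketch, not generated output: roughly the argument dicts that
# parse_arguments would build for the hypothetical signature above.
expected_arguments = [
    {'type': 'Tensor', 'name': 'self', 'is_nullable': False},
    {'type': 'Scalar', 'name': 'p', 'is_nullable': False, 'default': 2},
    # the bare '*' flips kwarg_only for everything that follows
    {'type': 'bool', 'name': 'keepdim', 'is_nullable': False,
     'default': False, 'kwarg_only': True},
]
print(expected_arguments)
```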
+ if is_out_fn and arg_idx < len(func_return): + argument_dict['output'] = True + if kwarg_only: + argument_dict['kwarg_only'] = True + + arguments.append(argument_dict) + return arguments + + +def has_sparse_dispatches(dispatches): + for dispatch in dispatches: + if 'Sparse' in dispatch: + return True + return False + + +def parse_native_yaml(path): + with open(path, 'r') as f: + return yaml.load(f, Loader=Loader) + + +def run(paths): + declarations = [] + for path in paths: + for func in parse_native_yaml(path): + declaration = {'mode': 'native'} + try: + if '->' in func['func']: + func_decl, return_type = [x.strip() for x in func['func'].split('->')] + return_type = sanitize_types(return_type) + else: + func_decl = func['func'] + return_type = [None] + fn_name, arguments = func_decl.split('(') + arguments = arguments.split(')')[0] + declaration['name'] = func.get('name', fn_name) + return_type = list(func.get('return', return_type)) + arguments = parse_arguments(arguments, func, declaration['name'], return_type) + output_arguments = [x for x in arguments if x.get('output')] + declaration['return'] = return_type if len(output_arguments) == 0 else output_arguments + declaration['variants'] = func.get('variants', ['method', 'function']) + declaration['deprecated'] = func.get('deprecated', False) + declaration['device_guard'] = func.get('device_guard', True) + declaration['arguments'] = func.get('arguments', arguments) + declaration['type_method_definition_dispatch'] = func.get('dispatch', declaration['name']) + declaration['aten_sparse'] = has_sparse_dispatches( + declaration['type_method_definition_dispatch']) + declarations.append(declaration) + except Exception as e: + msg = '''Exception raised in processing function: +{func} +Generated partial declaration: +{decl}'''.format(func=pprint.pformat(func), decl=pprint.pformat(declaration)) + print(msg, file=sys.stderr) + raise e + + return declarations diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml new file mode 100644 index 0000000..4590777 --- /dev/null +++ b/aten/src/ATen/nn.yaml @@ -0,0 +1,284 @@ +# Loss functions + +- name: binary_cross_entropy(Tensor self, Tensor target, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean) + cname: BCECriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: kl_div(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: DistKLDivCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: l1_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: AbsCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: mse_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: MSECriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: multi_margin_loss(Tensor self, LongTensor target, Scalar p=1, Scalar margin=1, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean) + cname: MultiMarginCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: multilabel_margin_loss(Tensor self, LongTensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: MultiLabelMarginCriterion + buffers: [is_target] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + is_target: target_->isScalar() + +- name: nll_loss(Tensor self, LongTensor target, Tensor weight={}, int64_t 
reduction=Reduction::ElementwiseMean, int64_t ignore_index=-100) + cname: ClassNLLCriterion + buffers: [total_weight] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + total_weight: 'true' + +- name: nll_loss2d(Tensor self, LongTensor target, Tensor weight={}, int64_t reduction=Reduction::ElementwiseMean, int64_t ignore_index=-100) + cname: SpatialClassNLLCriterion + buffers: [total_weight] + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + total_weight: 'true' + +- name: smooth_l1_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: SmoothL1Criterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +- name: soft_margin_loss(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) + cname: SoftMarginCriterion + scalar_check: + output: reduction != Reduction::None || self_->isScalar() + +# Activation functions + +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) + cname: ELU + has_inplace: True + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +- name: glu(Tensor self, int64_t dim=-1) + cname: GatedLinear + wrap_dim: + dim: self + scalar_check: + output: 'false' + +- name: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) + cname: HardTanh + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: leaky_relu(Tensor self, Scalar negative_slope=0.01) + cname: LeakyReLU + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: log_sigmoid(Tensor self) + cname: LogSigmoid + buffers: [buffer] + scalar_check: + output: self_->isScalar() + buffer: self_->isScalar() + +- name: prelu(Tensor self, Tensor weight) + cname: PReLU + scalar_check: + output: self_->isScalar() + +# NOTE: we treat noise as an input (it's really a buffer) because the codegen +# can't handle in-place functions that have buffers +- name: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) + cname: RReLU + has_inplace: True + scalar_check: + output: self_->isScalar() + +- name: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) + cname: SoftPlus + scalar_check: + output: self_->isScalar() + +- name: softshrink(Tensor self, Scalar lambd=0.5) + cname: SoftShrink + scalar_check: + output: self_->isScalar() + +- name: threshold(Tensor self, Scalar threshold, Scalar value) + cname: Threshold + has_inplace: True + scalar_check: + output: self_->isScalar() + +# Pooling + +- name: adaptive_avg_pool2d(Tensor self, IntList[2] output_size) + cname: SpatialAdaptiveAveragePooling + +- name: adaptive_avg_pool3d(Tensor self, IntList[3] output_size) + cname: VolumetricAdaptiveAveragePooling + +- name: adaptive_max_pool2d(Tensor self, IntList[2] output_size) + cname: SpatialAdaptiveMaxPooling + +- name: adaptive_max_pool3d(Tensor self, IntList[3] output_size) + cname: VolumetricAdaptiveMaxPooling + +- name: avg_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, bool ceil_mode=false, bool count_include_pad=true) + cname: SpatialAveragePooling + default_init: + stride: kernel_size + +- name: avg_pool3d(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, bool ceil_mode=false, bool count_include_pad=true) + cname: VolumetricAveragePooling + default_init: + stride: kernel_size + +- name: fractional_max_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] output_size, 
Tensor random_samples) + cname: SpatialFractionalMaxPooling + scalar_check: + output: 'false' + +- name: max_pool2d_with_indices(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false) + cname: SpatialDilatedMaxPooling + default_init: + stride: kernel_size + +- name: max_pool3d_with_indices(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false) + cname: VolumetricDilatedMaxPooling + default_init: + stride: kernel_size + +- name: max_unpool2d(Tensor self, LongTensor indices, IntList[2] output_size) + cname: SpatialMaxUnpooling + +- name: max_unpool3d(Tensor self, LongTensor indices, IntList[3] output_size, IntList[3] stride, IntList[3] padding) + cname: VolumetricMaxUnpooling + +# Padding + +- name: reflection_pad1d(Tensor self, IntList[2] padding) + cname: TemporalReflectionPadding + +- name: reflection_pad2d(Tensor self, IntList[4] padding) + cname: SpatialReflectionPadding + +- name: replication_pad1d(Tensor self, IntList[2] padding) + cname: TemporalReplicationPadding + +- name: replication_pad2d(Tensor self, IntList[4] padding) + cname: SpatialReplicationPadding + +- name: replication_pad3d(Tensor self, IntList[6] padding) + cname: VolumetricReplicationPadding + +# Upsampling + +# Note: The upsampling backwards functions also include an IntList input_size +# parameter, which is added by nn_parse.py + +- name: upsample_linear1d(Tensor self, IntList[1] output_size, bool align_corners) + cname: TemporalUpSamplingLinear + scalar_check: + grad_input: 'false' + +- name: upsample_bilinear2d(Tensor self, IntList[2] output_size, bool align_corners) + cname: SpatialUpSamplingBilinear + scalar_check: + grad_input: 'false' + +- name: upsample_trilinear3d(Tensor self, IntList[3] output_size, bool align_corners) + cname: VolumetricUpSamplingTrilinear + scalar_check: + grad_input: 'false' + +- name: upsample_nearest1d(Tensor self, IntList[1] output_size) + cname: TemporalUpSamplingNearest + scalar_check: + grad_input: 'false' + +- name: upsample_nearest2d(Tensor self, IntList[2] output_size) + cname: SpatialUpSamplingNearest + scalar_check: + grad_input: 'false' + +- name: upsample_nearest3d(Tensor self, IntList[3] output_size) + cname: VolumetricUpSamplingNearest + scalar_check: + grad_input: 'false' + + +# Private functions. These also exist in TH, but we want the backwards functions +# to implement derivatives. + +- name: _sigmoid(Tensor self) + cname: Sigmoid + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +- name: _tanh(Tensor self) + cname: Tanh + scalar_check: + output: self_->isScalar() + grad_input: output_->isScalar() + +# Batch normalization + +# The buffers here are somewhat hazardous, because their type will be +# based off of self, even though you may plausibly wish running_mean +# and running_var to have different precision than self (e.g., +# BatchNorm on half). Fortunately, THNN doesn't actually ever do this, +# so the buffer allocation code is "correct". If you ever do fix this, +# you should just port the function entirely to a native ATen function. 
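Each record in this file pairs an ATen-style signature (`name`) with the THNN module that backs it (`cname`), plus optional metadata such as `buffers`, `default_init`, `scalar_check`, and `has_inplace`. A minimal sketch of loading one such record (the `softshrink` entry above) with PyYAML; `nn_parse.py`, further below, turns a list of these into declaration dicts:

```python
import yaml

# What one record (the softshrink entry above) looks like once loaded.
snippet = """
- name: softshrink(Tensor self, Scalar lambd=0.5)
  cname: SoftShrink
  scalar_check:
    output: self_->isScalar()
"""
(entry,) = yaml.safe_load(snippet)
print(entry['name'])          # softshrink(Tensor self, Scalar lambd=0.5)
print(entry['cname'])         # SoftShrink
print(entry['scalar_check'])  # {'output': 'self_->isScalar()'}
```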
+- name: thnn_batch_norm(Tensor self, Tensor weight, Tensor bias, Tensor running_mean, Tensor running_var, bool training, double momentum, double eps) + cname: BatchNormalization + buffers: [save_mean, save_std] + +# Convolutions + +- name: thnn_conv_transpose2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, IntList[2] dilation=1) + cname: SpatialFullDilatedConvolution + buffers: [columns, ones] + +- name: thnn_conv_transpose3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, IntList[3] dilation=1) + cname: VolumetricFullDilatedConvolution + buffers: [finput, fgrad_input] + +- name: thnn_conv2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0) + cname: SpatialConvolutionMM + buffers: [finput, fgrad_input] + +- name: thnn_conv_depthwise2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1) + cname: SpatialDepthwiseConvolution + buffers: [] + +- name: thnn_conv3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0) + cname: VolumetricConvolutionMM + buffers: [finput, fgrad_input] + +- name: thnn_conv_dilated2d(Tensor self, Tensor weight, IntList[2] kernel_size, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1) + cname: SpatialDilatedConvolution + buffers: [columns, ones] + +- name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) + cname: VolumetricDilatedConvolution + buffers: [columns, ones] + +# Vision + +- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) + cname: SpatialGridSamplerBilinear + +- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) + cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py new file mode 100644 index 0000000..d3e46f8 --- /dev/null +++ b/aten/src/ATen/nn_parse.py @@ -0,0 +1,415 @@ +import copy +import re +import common_with_cwrap +import yaml +from collections import OrderedDict, defaultdict + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +# matches `name`, `params` in `name(params)` +NAME_PARAM_REGEX = r'(\w+)\((.*)\)' + + +def argument_to_declaration(param, func=None): + arg = {} + arg['type'], name = param.split(' ') + if arg['type'] == 'Tensor': + arg['type'] = 'THTensor*' + elif arg['type'] == 'LongTensor': + arg['type'] = 'THIndexTensor*' + elif arg['type'] == 'Scalar': + arg['type'] = 'accreal' + elif arg['type'] == 'Generator*': + arg['type'] = 'THGenerator*' + + match = re.match(r'IntList\[(\d+)\]', arg['type']) + if match: + arg['type'] = 'IntList' + arg['size'] = int(match.group(1)) + + if '=' in name: + name, default = name.split('=') + arg['optional'] = True + arg['default'] = default + arg['name'] = name + + if func is not None: + default_inits = func.get('default_init', {}) + wrap_dims = func.get('wrap_dim', {}) + if name in default_inits: + # non constexpr defaults + arg['default_init'] = default_inits[name] + if name in wrap_dims: + arg['wrap_dim'] = wrap_dims[name] + + return arg + + +def output_arguments(thnn_function): + cname = 
thnn_function.name + output_args = [] + + # function_wrapper expects everything in a declaration to be in + # the base type (i.e. THTensor*), but if we pull a THCUNN only + # implementation, it will have THCTensor* as the arg type. So we + # strip the THC here before returning + def map_to_th_type(t): + if t.startswith('THC'): + t = t.replace('THC', 'TH') + return t + + def is_output_arg(arg_name, func_name): + if arg_name == 'output' and 'updateOutput' in cname: + return True + if name in {'gradInput', 'gradWeight', 'gradBias', 'gradGrid'}: + return True + if arg_name == 'indices' and 'updateOutput' in cname and 'Unpool' not in cname: + # indices is an output argument in pooling and an input in unpooling + return True + return False + + for arg in thnn_function.arguments: + name = arg.name + if is_output_arg(name, cname): + desc = { + 'type': map_to_th_type(arg.type), + 'name': camel_to_snake(name), + 'output': True, + } + if name.startswith('grad_'): + desc['is_nullable'] = True + output_args.append(desc) + return output_args + + +def get_return(args): + indices = [str(idx) for idx, arg in enumerate(args) if arg.get('output')] + return 'argument {}'.format(','.join(indices)) + + +ARGUMENT_MAPPINGS = { + 'k': 'kernel_size', + 'd': 'stride', + 'pad': 'padding', + 'p': 'padding', + 'o': 'output_size', + 'osize': 'output_size', + 'output': 'output_size', # as a prefix e.g. outputW + 'isize': 'input_size', + 'dilation': 'dilation', + 'adj': 'output_padding', + 'a': 'output_padding', +} + +DIMENSION_OFFSET = { + 'width': -1, + 'height': -2, + 'B': 0, + 'C': 1, + 'W': -1, + 'H': -2, + 'T': -3, + 'left': 0, + 'right': 1, + 'top': 2, + 'bottom': 3, + 'front': 4, + 'back': 5, +} + +SUBSTITUTIONS = { + 'input': 'self', + 'weights': 'weight', + 'train': 'training', + 'val': 'value', + 'lambda': 'lambd', + 'negval': 'negative_slope', +} + + +def camel_to_snake(name): + # from https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def get_thnn_args(thnn_function, params, inplace): + params_by_name = {p['name']: p for p in params} + + def arg_expr(prefix, suffix): + # e.g kW, kH + name = ARGUMENT_MAPPINGS[prefix] + if name not in params_by_name: + raise RuntimeError('missing arg "{}" in {}'.format(name, thnn_function.name)) + param = params_by_name[name] + if param['type'] == 'IntList' and 'size' in param: + name = name + '_' + index = DIMENSION_OFFSET[suffix] + if index < 0: + index += param['size'] + expr = '{}[{}]'.format(name, index) + return {'type': 'EXPRESSION', 'name': expr} + + thnn_args = [] + for arg in thnn_function.arguments: + name = arg.name + if name == 'state': + continue + if inplace and name == 'output': + name = 'self' + aten_name = camel_to_snake(SUBSTITUTIONS.get(name, name)) + parts = aten_name.split('_') + if aten_name in params_by_name: + param = params_by_name[aten_name] + if arg.is_optional: + param['is_nullable'] = True + thnn_args.append(copy.deepcopy(param)) + elif len(parts) == 2 and parts[0] in ARGUMENT_MAPPINGS and parts[1] in DIMENSION_OFFSET: + # e.g. 
pad_left + thnn_args.append(arg_expr(parts[0], parts[1])) + elif name[-1] in DIMENSION_OFFSET and name[:-1] in ARGUMENT_MAPPINGS: + # e.g kW, kH + thnn_args.append(arg_expr(name[:-1], name[-1])) + elif name == 'owidth' or name == 'oheight': + thnn_args.append(arg_expr(name[0], name[1:])) + elif name == 'scale': + thnn_args.append({'type': 'EXPRESSION', 'name': '1'}) + elif name == 'inplace': + thnn_args.append({'type': 'EXPRESSION', 'name': str(inplace).lower()}) + else: + raise RuntimeError("{}: can't find binding for '{}'" + .format(thnn_function.name, name)) + return thnn_args + + +def remove_unused_args(args, thnn_args): + """Returns the subset of args whose name appears in thnn_args""" + def clean_name(name): + name = name[:name.index('[')] if '[' in name else name + if name.endswith('_'): + name = name[:-1] + return name + uses = set([clean_name(arg['name']) for arg in thnn_args]) + uses.add('output_mask') + args = [arg for arg in args if arg['name'] in uses] + for arg in args: + if 'default' in arg: + del arg['default'] + return args + + +def unique_args(argslist): + result = [] + seen = set() + for args in argslist: + for arg in args: + if arg['name'] in seen: + continue + seen.add(arg['name']) + result.append(arg) + return result + + +def function_info(name, arguments, cimpls, buffers, backends, inplace, scalar_check): + """ + cimpls contains information use to call into THNN: + cname: THNN function name + arguments: arguments to functional call + condition: [optional] guard around call + """ + return { + 'mode': 'NN', + 'name': name, + 'types': ['Float', 'Double', 'Half'], # Half will be stripped for CPU backend + 'arguments': arguments, + 'return': 'argument 0' if inplace else get_return(arguments), + 'buffers': buffers, + 'backends': backends, + 'cimpls': cimpls, + 'scalar_check': scalar_check, + 'variants': ['function'], + } + + +def base_declaration(func, thnn_function, backends, inplace=False): + """Creates the NN function without any buffers in it's signature""" + name, params = re.match(NAME_PARAM_REGEX, func['name']).groups() + if inplace: + name += '_' + params = params.split(', ') + arguments = [argument_to_declaration(a, func) for a in params] + if not inplace: + arguments += output_arguments(thnn_function) + buffers = [argument_to_declaration('Tensor ' + buf) + for buf in func.get('buffers', [])] + + return function_info(name, arguments, None, buffers, backends, inplace, func.get('scalar_check')) + + +def forward_declaration(base, thnn_function, inplace=False): + name = '{}_forward'.format(base['name']) + if inplace: + name += '_' + + arguments = [copy.deepcopy(arg) for arg in base['arguments'] + if not arg.get('output')] + + arguments += output_arguments(thnn_function) + for buffer in base['buffers']: + buffer = copy.deepcopy(buffer) + buffer['output'] = True + arguments.append(buffer) + + thnn_args = get_thnn_args(thnn_function, arguments, inplace) + arguments = remove_unused_args(arguments, thnn_args) + cimpl = {'cname': thnn_function.name, 'arguments': thnn_args} + + scalar_check = base['scalar_check'] + if scalar_check is not None: + output_arg_names = [arg['name'] for arg in arguments if arg.get('output', False)] + scalar_check = {k: v for (k, v) in scalar_check.items() if k in output_arg_names} + + return function_info(name, arguments, [cimpl], [], base['backends'], inplace, scalar_check) + + +def backward_declaration(base, thnn_functions): + name = '{}_backward'.format(base['name']) + + arguments = [] + arguments.append({'type': 'THTensor*', 'name': 
'grad_output'}) + arguments += [copy.deepcopy(arg) for arg in base['arguments'] + if arg['name'] != 'inplace'] + arguments += base['buffers'] + + if 'upsample' in base['name']: + # Add input_size as parameter to upsample backwards functions + # Note that input_size is 4-dim for upsample_xxx2d + size = 2 + int(re.search(r'(\d+)d', base['name']).group(1)) + input_size_arg = {'type': 'IntList', 'name': 'input_size', 'size': size} + for output_size_idx, arg in enumerate(arguments): + if arg['name'] == 'output_size': + break + arguments.insert(output_size_idx + 1, input_size_arg) + + # outputs from the forward may be inputs to the backwards + for arg in arguments: + if 'output' in arg: + del arg['output'] + + arguments += unique_args([output_arguments(f) for f in thnn_functions]) + + def initialize_output_arg(arg): + # the mask array specifies which return values to compute + arg['mask'] = True + arg['is_nullable'] = True + + # grad_weight and grad_bias need to be resized and zeroed + if arg['name'] == 'grad_weight': + arg['resize'] = 'weight' + arg['zero'] = True + if arg['name'] == 'grad_bias': + dim = 1 if 'transpose' in name else 0 + arg['resize'] = [('weight', dim)] + arg['zero'] = True + + is_batch_norm_backward = '_backward' in thnn_functions[0].name + grad_params = [] + if len(thnn_functions) > 1 or is_batch_norm_backward: + for arg in arguments: + if arg.get('output', False): + initialize_output_arg(arg) + if 'Tensor' in arg['type'] and arg['name'].startswith('grad_') and \ + 'input' not in arg['name'] and 'output' not in arg['name']: + grad_params.append(arg['name']) + + thnn_args = [get_thnn_args(f, arguments, False) for f in thnn_functions] + arguments = remove_unused_args(arguments, unique_args(thnn_args)) + cimpls = [] + + def get_condition(func): + # only call into the THNN functions if the output args are not null + if '_updateGradInput' in func.name: + return 'grad_input_' + if '_accGradParameters' in func.name: + return ' || '.join(p + '_' for p in grad_params) + return None + + for func, args in zip(thnn_functions, thnn_args): + cimpl = {'cname': func.name, 'arguments': args} + if len(thnn_functions) > 1: + cimpl['condition'] = get_condition(func) + cimpls.append(cimpl) + + output_args = [arg for arg in arguments if arg.get('output', False)] + scalar_check_arg = base['scalar_check'] if base['scalar_check'] is not None else dict() + scalar_check = {k: v for (k, v) in scalar_check_arg.items() if k in [a['name'] for a in output_args]} + for arg in output_args: + # resize automatically sets scalar_check + if scalar_check.get(arg['name']) is not None or arg.get('resize', False): + pass + else: + base_name = arg['name'][len('grad_'):] if arg['name'] != 'grad_input' else 'self' + if base_name in [a['name'] for a in arguments]: + scalar_check[arg['name']] = base_name + '_->isScalar()' + else: + raise ValueError(("Could not infer scalar_check for {} argument of func {} because {} " + "does not exist. Please explicitly specify scalar_check." 
+ .format(arg['name'], name, base_name))) + + return function_info(name, arguments, cimpls, [], base['backends'], False, scalar_check) + + +def parse_nn_yaml(filename): + with open(filename, 'r') as f: + return yaml.load(f, Loader=Loader) + + +include_only = '(updateOutput|updateGradInput|accGradParameters|backward)$' +exclude = 'LookupTable' + + +def run(paths): + function_backends = defaultdict(list) + header_functions = OrderedDict() + + headers = [p for p in paths if p.endswith('.h')] + yamls = [p for p in paths if p.endswith('.yaml')] + + for path in headers: + backend = 'CUDA' if re.search('THCU', path) else 'CPU' + for func in common_with_cwrap.parse_header(path): + if re.search(include_only, func.name) is None or re.search(exclude, func.name) is not None: + continue + function_backends[func.name].append(backend) + if func.name not in header_functions: + header_functions[func.name] = func + + bwd_suffixes = ['_updateGradInput', '_accGradParameters', '_backward'] + + declarations = [] + for path in yamls: + for func in parse_nn_yaml(path): + cname = func['cname'] + backends = function_backends[cname + '_updateOutput'] + + fwd_function = header_functions[cname + '_updateOutput'] + bwd_functions = [] + for suffix in bwd_suffixes: + if cname + suffix in header_functions: + bwd_functions.append(header_functions[cname + suffix]) + + base = base_declaration(func, fwd_function, backends) + declarations.append(base) + declarations.append(forward_declaration(base, fwd_function)) + declarations.append(backward_declaration(base, bwd_functions)) + + if func.get('has_inplace', False): + declarations.append(base_declaration(func, fwd_function, backends, True)) + declarations.append(forward_declaration(base, fwd_function, True)) + + return declarations diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h new file mode 100644 index 0000000..287ddd8 --- /dev/null +++ b/aten/src/ATen/optional.h @@ -0,0 +1,982 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. + +#pragma once + +# include +# include +# include +# include +# include +# include +# include + +# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +# if defined __GNUC__ // NOTE: GNUC is also defined for Clang +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# endif +# endif +# +# if defined __clang_major__ +# if (__clang_major__ == 3 && __clang_minor__ >= 5) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# elif (__clang_major__ > 3) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# endif +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# endif +# endif +# +# if defined _MSC_VER +# if (_MSC_VER >= 1900) +# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# endif +# endif + +# if defined __clang__ +# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif +# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif + + +# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +# else +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +# define OPTIONAL_CONSTEXPR_INIT_LIST +# endif + +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) +# define OPTIONAL_HAS_MOVE_ACCESSORS 1 +# else +# define OPTIONAL_HAS_MOVE_ACCESSORS 0 +# endif + +# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +# if (defined __cplusplus) && (__cplusplus == 201103L) +# define OPTIONAL_MUTABLE_CONSTEXPR +# else +# define OPTIONAL_MUTABLE_CONSTEXPR constexpr +# endif + +namespace at { + +// 20.5.4, optional for object types +template class optional; + +// 20.5.5, optional for lvalue reference types +template class optional; + + +// workaround: std utility functions aren't constexpr yet +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept +{ + return static_cast(t); +} + +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept +{ + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept +{ + return static_cast::type&&>(t); +} + + +#if defined NDEBUG +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) +#endif + + +namespace detail_ +{ + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof +{ + template + constexpr static bool has_overload(...) { return false; } + + template ().operator&()) > + constexpr static bool has_overload(bool) { return true; } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) +{ + return &ref; +} + +template )> +T* static_addressof(T& ref) +{ + return std::addressof(ref); +} + + +// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { return v; } + +} // namespace detail + + +constexpr struct trivial_init_t{} trivial_init{}; + + +// 20.5.6, In-place construction +constexpr struct in_place_t{} in_place{}; + + +// 20.5.7, Disengaged state indicator +struct nullopt_t +{ + struct init{}; + constexpr explicit nullopt_t(init){} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { +public: + explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + + +template +union storage_t +{ + unsigned char dummy_; + T value_; + + constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~storage_t(){} +}; + + +template +union constexpr_storage_t +{ + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + + +template +struct optional_base +{ + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { if (init_) storage_.value_.T::~T(); } +}; + + +template +struct constexpr_optional_base +{ + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type> +>::type; + + + +template +class optional : private OptionalBase +{ + static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); + static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); + + + constexpr bool initialized() const noexcept { return OptionalBase::init_; } + typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } + constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } +# else + T& contained_val() & { return OptionalBase::storage_.value_; } + T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } +# endif +# else + constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } + T& contained_val() { return OptionalBase::storage_.value_; } +# endif + + void clear() noexcept { + if (initialized()) dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + +public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase() {}; + constexpr optional(nullopt_t) noexcept : OptionalBase() {}; + + optional(const optional& rhs) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept + { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) + noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) + -> typename std::enable_if + < + std::is_same::type, T>::value, + optional& + >::type + { + if (initialized()) { contained_val() = std::forward(v); } + else { initialize(std::forward(v)); } + return *this; + } + + + template + void emplace(Args&&... args) + { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) + { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) + { + if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } + else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } + else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { return initialized(); } + constexpr bool has_value() const noexcept { return initialized(); } + + constexpr T const* operator ->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { + assert (initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { + assert (initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +# else + + T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const { + return contained_val(); + } + + T& operator *() { + assert (initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + +# endif + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# else + + template + T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# endif + +# else + + template + constexpr T value_or(V&& v) const + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# endif + + // 20.6.3.6, modifiers + void reset() noexcept { clear(); } +}; + + +template +class optional +{ + static_assert( !std::is_same::value, "bad T" ); + static_assert( !std::is_same::value, "bad T" ); + T* ref; + +public: + + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + std::is_same::type, optional>::value, + optional& + >::type + { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + !std::is_same::type, optional>::value, + optional& + >::type + = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + + void swap(optional& rhs) noexcept + { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const + { + return *this ? **this : detail_::convert::type>(constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { ref = nullptr; } +}; + + +template +class optional +{ + static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); +}; + + +// 20.5.8, Relational operators +template constexpr bool operator==(const optional& x, const optional& y) +{ + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template constexpr bool operator!=(const optional& x, const optional& y) +{ + return !(x == y); +} + +template constexpr bool operator<(const optional& x, const optional& y) +{ + return (!y) ? false : (!x) ? true : *x < *y; +} + +template constexpr bool operator>(const optional& x, const optional& y) +{ + return (y < x); +} + +template constexpr bool operator<=(const optional& x, const optional& y) +{ + return !(y < x); +} + +template constexpr bool operator>=(const optional& x, const optional& y) +{ + return !(x < y); +} + + +// 20.5.9, Comparison with nullopt +template constexpr bool operator==(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator==(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + +template constexpr bool operator!=(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator!=(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<(const optional&, nullopt_t) noexcept +{ + return false; +} + +template constexpr bool operator<(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<=(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator<=(nullopt_t, const optional&) noexcept +{ + return true; +} + +template constexpr bool operator>(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator>(nullopt_t, const optional&) noexcept +{ + return false; +} + +template constexpr bool operator>=(const optional&, nullopt_t) noexcept +{ + return true; +} + +template constexpr bool operator>=(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + + + +// 20.5.10, Comparison with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? 
*x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) +{ + x.swap(y); +} + + +template +constexpr optional::type> make_optional(T&& v) +{ + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) +{ + return optional(v.get()); +} + + +} // namespace at + +namespace std +{ + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } + }; + + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } + }; +} + +# undef TR2_OPTIONAL_REQUIRES +# undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py new file mode 100644 index 0000000..1bc33e5 --- /dev/null +++ b/aten/src/ATen/preprocess_declarations.py @@ -0,0 +1,242 @@ +import re +from copy import deepcopy +from function_wrapper import TYPE_FORMAL_GENERIC +import common_with_cwrap + +type_map = { + 'floating_point': [ + 'Float', + 'Double', + 'Half', + ], + 'integral': [ + 'Byte', + 'Char', + 'Short', + 'Int', + 'Long' + ], +} + +all_types = type_map['floating_point'] + type_map['integral'] +type_map['all'] = all_types + +all_backends = ['CPU', 'CUDA', 'SparseCPU', 'SparseCUDA'] +default_backends = ['CPU', 'CUDA'] + +sparse_map = { + 'CPU': 'SparseCPU', + 'CUDA': 'SparseCUDA', +} + + +def process_types_and_backends(option): + # if specific pairs were not listed, then enumerate them + # based on the backend and type attributes + # if backend or type is not defined, it is assumed to be all of them + if 'backend_type_pairs' not in option: + backends = option.get('backends', default_backends) + if option.get('aten_sparse', False): + backends.extend([sparse_map[p] for p in backends if p in sparse_map]) + backends = set(backends) + + types = option.get('types', all_types) + + pairs = [[p, t] for p in backends for t in types] + else: + pairs = option['backend_type_pairs'] + + # expand type alias (integral, floating_point, all) + def expand(pair): + p, t = pair + assert(p in all_backends) + if t in type_map: + return [(p, tt) for tt in type_map[t]] + assert(t in all_types) + return [(p, t)] + pairs = set(p for pair in pairs for p in expand(pair)) + + # disable CUDA Half if there is a Sparse argument + for arg in option.get('arguments', []): + if arg['type'] == 'THSTensor*': + pairs.discard(('CUDA', 'Half')) + + # special case remove Half for cpu unless it is explicitly enabled, + if not option.get('cpu_half', False): + pairs.discard(('CPU', 'Half')) + + # sort the result for easy reading + option['backend_type_pairs'] = sorted([p for p in pairs]) + + +def exclude(declaration): + return 'only_register' in declaration or declaration.get('python_name') == 'ndimension' + + +def add_variants(option): + option.setdefault('variants', ['method']) + +# if we have 'output' arguments, generate a variant where +# we mark oututs as allocate = True, and where the method variant +# is disabled... 
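# --- Illustrative sketch, not part of the diff ---------------------------------
# A minimal example of what handle_outputs_taken_as_arguments (defined just below)
# produces for one declaration with an 'output' argument: an explicit-output
# '_out' variant that is a function only, plus an allocating variant whose output
# argument is marked allocate=True. The 'add'-style option is hypothetical; real
# options come from the parsed cwrap/YAML declarations, and this assumes the
# script and its sibling modules (function_wrapper, common_with_cwrap) are
# importable.
from preprocess_declarations import handle_outputs_taken_as_arguments

option = {
    'api_name': 'add',
    'mode': 'TH',
    'variants': ['method', 'function'],
    'arguments': [
        {'name': 'result', 'type': 'THTensor*', 'output': True},
        {'name': 'self',   'type': 'THTensor*'},
        {'name': 'other',  'type': 'THTensor*'},
    ],
}

for opt in handle_outputs_taken_as_arguments([option]):
    print(opt['api_name'], opt['variants'],
          [bool(arg.get('allocate')) for arg in opt['arguments']])
# Expected shape of the output:
#   add_out ['function'] [False, False, False]       <- explicit-output variant, no longer a method
#   add ['method', 'function'] [True, False, False]  <- allocating variant
# --------------------------------------------------------------------------------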
+ + +def handle_outputs_taken_as_arguments(options): + new_options = [] + + def is_nullable(arg): + return (arg['type'] in {'THIntegerTensor*', 'THTensor*'} and + arg.get('default', '') in {None, 'NULL', 'nullptr'}) + + def should_generate_out_variant(option): + if 'function' in option['variants'] and option['mode'] != 'native': + # don't generate _out variants for in-place functions + return re.search('(^__i|[^_]_$)', option['api_name']) is None + return False + + for option in options: + for arg in option['arguments']: + # mark arguments which can be null + if is_nullable(arg): + arg['is_nullable'] = True + + if any('output' in arg for arg in option['arguments']): + allocate_option = deepcopy(option) + # the allocating option needs to be marked + for arg in allocate_option['arguments']: + if 'output' in arg: + arg['allocate'] = True + + # the original option, which takes arguments for the results, + # is no longer a method, and has _out added to indicte it takes + # output arguments + if should_generate_out_variant(option): + if 'method' in option['variants']: + option['variants'].remove('method') + option['api_name'] += '_out' + new_options.append(option) + + new_options.append(allocate_option) + else: + new_options.append(option) + return new_options + + +def sanitize_return(option): + ret = option['return'] + m = re.match('argument (\d+(,\d+)*)', ret) + if m is not None: + arguments = [int(x) for x in m.group(1).split(',')] + option['return'] = {'kind': 'arguments', 'arguments': arguments} + elif ret == 'self': + option['return'] = {'kind': 'arguments', 'arguments': []} + for i, x in enumerate(option['arguments']): + if x['name'] == 'self': + option['return']['arguments'].append(i) + break + else: + option['return'] = {'kind': 'type', 'type': option['return']} + + +def set_mode(option): + option['mode'] = option.get('mode', 'TH') + +# To enable 0-dim support in TH operations +# we find all places where a single Scalar replaced with a Tensor +# as an argument is still a valid function +# we then mark the tensor variant with a key zero_dim_dispatch_when_scalar: name +# where 'name' is the name of the argument that should be a scalar +# during dispatch, if that argument is marked internally as holding a scalar +# then the method will dispatch to that function. 
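# --- Illustrative sketch, not part of the diff ---------------------------------
# How the signature matching in discover_zero_dim_tensor_operations (defined just
# below) pairs a scalar overload with its tensor overload: substituting 'Tensor &'
# for the 'real' (scalar) argument reproduces the tensor overload's signature, so
# the tensor overload is tagged with the name of the argument to watch at dispatch
# time. FORMAL is a hypothetical stand-in for TYPE_FORMAL_GENERIC, and both
# options are made-up examples.
FORMAL = {'THTensor*': 'Tensor &', 'real': 'Scalar'}

def sig(option, i=None, value=None):
    return '#'.join(value if j == i else FORMAL.get(a['type'], a['type'])
                    for j, a in enumerate(option['arguments']))

scalar_version = {'arguments': [{'name': 'self',  'type': 'THTensor*'},
                                {'name': 'other', 'type': 'real'}]}
tensor_version = {'arguments': [{'name': 'self',  'type': 'THTensor*'},
                                {'name': 'other', 'type': 'THTensor*'}]}

# Swapping argument 1 of the scalar overload for 'Tensor &' matches the tensor overload,
# so dispatch should go to the tensor overload when 'other' holds a 0-dim tensor/scalar.
assert sig(scalar_version, i=1, value='Tensor &') == sig(tensor_version)
tensor_version['zero_dim_dispatch_when_scalar'] = \
    tensor_version['arguments'][1]['name']   # -> 'other'
# --------------------------------------------------------------------------------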
+ + +def discover_zero_dim_tensor_operations(declaration): + def exclude(arg): + return arg.get('ignore_check') + + def signature(option, i=None, value=None): + elements = [TYPE_FORMAL_GENERIC.get(arg['type'], arg['type']) + if i is None or j != i else value + for j, arg in enumerate(option['arguments']) + if not exclude(arg)] + return '#'.join(elements) + signature_to_option = {signature(option): option + for option in declaration['options']} + + for option in declaration['options']: + for i, arg in enumerate(option['arguments']): + if arg['type'] == 'real': + signature_of_tensor_version = signature(option, i, 'Tensor &') + if signature_of_tensor_version in signature_to_option: + tensor_version = \ + signature_to_option[signature_of_tensor_version] + names = [arg['name'] for arg in tensor_version['arguments'] + if not exclude(arg)] + tensor_version['zero_dim_dispatch_when_scalar'] = names[i] + # print("FOUND "+str(i) ) + # print("Scalar Version ===== ") + # print(yaml.dump(option)) + # print("Tensor Version ===== ") + # print(yaml.dump(tensor_version)) + # print("SHARED "+names[i]) + + +def discover_sparse_tensor_operations(declaration): + def exclude(arg): + return arg.get('ignore_check') + + def signature(option, i=None, value=None): + elements = [TYPE_FORMAL_GENERIC.get(arg['type'], arg['type']) + if i is None or j != i else value + for j, arg in enumerate(option['arguments']) + if not exclude(arg)] + return '#'.join(elements) + + # Determine if any options have the 'aten_dense_sparse' flag + dense_sparse_options = [option + for option in declaration['options'] + if option.get('aten_dense_sparse', False)] + if len(dense_sparse_options) > 0: + signature_to_option = {signature(option): option + for option in declaration['options']} + + for option in declaration['options']: + for i, arg in enumerate(option['arguments']): + if (arg['type'] == 'THSTensor*' and + option.get('aten_dense_sparse', False)): + signature_of_tensor_version = signature( + option, i, 'Tensor &') + if signature_of_tensor_version in signature_to_option: + tensor_version = \ + signature_to_option[signature_of_tensor_version] + raw_args = len(tensor_version['arguments']) + names = [arg['name'] for arg in tensor_version['arguments'] + if not exclude(arg)] + filtered_args = len(names) + tensor_version['when_sparse_dispatch'] = names[i - + (raw_args - filtered_args)] + + +def run(declarations): + declarations = [d for d in declarations if not exclude(d)] + for declaration in declarations: + common_with_cwrap.set_declaration_defaults(declaration) + declaration['options'] = [deepcopy(o) for o in declaration['options']] + declaration['options'] = common_with_cwrap.filter_unique_options( + declaration['options'], + allow_kwarg=False, + type_to_signature=TYPE_FORMAL_GENERIC, + remove_self=True) + common_with_cwrap.sort_by_number_of_options(declaration) + discover_zero_dim_tensor_operations(declaration) + discover_sparse_tensor_operations(declaration) + + for option in declaration['options']: + set_mode(option) + if option['mode'] != 'native': + sanitize_return(option) + process_types_and_backends(option) + add_variants(option) + declaration['options'] = handle_outputs_taken_as_arguments( + declaration['options']) + return declarations diff --git a/aten/src/ATen/stub/CombinedStub.cpp b/aten/src/ATen/stub/CombinedStub.cpp new file mode 100644 index 0000000..e69de29 diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h new file mode 100644 index 0000000..2c510a4 --- /dev/null +++ 
b/aten/src/ATen/templates/Functions.h @@ -0,0 +1,36 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Scalar.h" +#include "ATen/Type.h" +#include "ATen/Tensor.h" +#include "ATen/Storage.h" +#include "ATen/Generator.h" +#include "ATen/Deprecated.h" +#include "ATen/NativeFunctions.h" +#include "ATen/DeviceGuard.h" +#include "ATen/TensorOptions.h" +#include "THNN/Reduction.h" + +namespace at { + +using native::from_blob; +using native::tensor; + +${function_declarations} + +static inline Type & infer_type(const Tensor & t) { + AT_CHECK(t.defined(), "undefined Tensor"); + return t.type(); +} +static inline Type & infer_type(const TensorList & tl) { + AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].type(); +} +// function definitions are all static inline because +// they are one-line statically dispatched functions that +// invoke the actual dynamic dispatch on the correct argument +${function_definitions} + +} diff --git a/aten/src/ATen/templates/GeneratorDerived.h b/aten/src/ATen/templates/GeneratorDerived.h new file mode 100644 index 0000000..9fde183 --- /dev/null +++ b/aten/src/ATen/templates/GeneratorDerived.h @@ -0,0 +1,31 @@ +#pragma once + +// ${generated_comment} + +#include <$header> + +#include "ATen/Generator.h" + +namespace at { + +class Context; +struct ${name}Generator : public Generator { + ${name}Generator(Context * context); + virtual ~${name}Generator(); + + virtual ${name}Generator& copy(const Generator& from) override; + virtual ${name}Generator& free() override; + + virtual uint64_t seed() override; + virtual uint64_t initialSeed() override; + virtual ${name}Generator& manualSeed(uint64_t seed) override; + virtual ${name}Generator& manualSeedAll(uint64_t seed) override; + virtual void * unsafeGetTH() override; + +//TODO(zach): figure out friends later +public: + Context * context; + ${th_generator} +}; + +} diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h new file mode 100644 index 0000000..2c84f21 --- /dev/null +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -0,0 +1,66 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at { +struct Generator; +class Scalar; +struct Tensor; +struct Type; +} // namespace at + +namespace at { +namespace native { + +inline Tensor from_blob( + void* data, + IntList sizes, + const std::function& deleter, + const TensorOptions& options = {}) { + return options.type().tensorFromBlob(data, sizes, deleter); +} + +inline Tensor from_blob( + void* data, + IntList sizes, + const TensorOptions& options = {}) { + return native::from_blob(data, sizes, [](void*) {}, options); +} + +// These functions are defined in native/TensorFactories.cpp. 
+#define TENSOR(T, S, _1) \ + Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return native::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return native::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return native::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return native::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return native::tensor(ArrayRef(value)); \ + } +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) +#undef TENSOR + +${native_function_declarations} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCUDA.cpp b/aten/src/ATen/templates/RegisterCUDA.cpp new file mode 100644 index 0000000..40c00c1 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCUDA.cpp @@ -0,0 +1,17 @@ +#include + +// ${generated_comment} + +#include +#include +#include + +${cuda_type_headers} + +namespace at { + +void register_cuda_types(Context * context) { + ${cuda_type_registrations} +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCUDA.h b/aten/src/ATen/templates/RegisterCUDA.h new file mode 100644 index 0000000..3fa97c6 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCUDA.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cuda_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp new file mode 100644 index 0000000..42a6ec9 --- /dev/null +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -0,0 +1,84 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +#include "ATen/${Type}.h" + +// ${generated_comment} + +#include "ATen/${Generator}.h" +#include "ATen/${DenseTensor}.h" +#include "ATen/${DenseBackend}LongTensor.h" +#include "ATen/Allocator.h" +#include "ATen/Half.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/THLongStorageView.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/Utils.h" +#include "ATen/DeviceGuard.h" +#include "ATen/optional.h" + +#include +#include +#include +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Type}::${Type}(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} +ScalarType ${Type}::scalarType() const { + return ScalarType::${ScalarName}; +} +Backend ${Type}::backend() const { + return Backend::${Backend}; +} +bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_distributed() const { return false; } + +std::unique_ptr ${Type}::storage() const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storage(size_t size) const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + AT_ERROR("storage not supported on sparse"); +} +std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { + AT_ERROR("storage not supported on sparse"); +} +Tensor 
${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not supported on sparse"); +} +std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { + AT_ERROR("unsafeTensorFromTH not supported on sparse"); +} +std::unique_ptr ${Type}::generator() const { + return std::unique_ptr(new ${Generator}(context)); +} + +const char * ${Type}::toString() const { + return ${Type}::typeString(); +} +TypeID ${Type}::ID() const { + return ${TypeID}; +} + +size_t ${Type}::elementSizeInBytes() const { + return sizeof(${ScalarType}); +} + +const char * ${Type}::typeString() { + return "${Type}"; +} + +${type_derived_method_definitions} + +} diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp new file mode 100644 index 0000000..83e10b9 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -0,0 +1,146 @@ +#include "ATen/${Storage}.h" + +// ${generated_comment} + +#include "ATen/Half.h" +#include "ATen/Allocator.h" + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Storage}::${Storage}(Context* context): + storage(${THStorage}_new(${state})), context(context) {} + +${Storage}::${Storage}(Context* context, THStorage* storage): + storage(storage), context(context) {} + +${Storage}::${Storage}(Context* context, size_t storage_size) + : storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} + +${Storage}::${Storage}(Context* context, size_t size, Allocator* allocator) + : storage(nullptr), + context(context) { + storage = ${THStorage}_newWithAllocator(${state,} size, allocator); + ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); +} + +// TODO: Take in Device as an input to the std::function constructor + +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + +${Storage}::${Storage}(Context* context, + void * data, size_t size, const std::function & deleter) + : storage(${THStorage}_newWithDataAndAllocator(${state,} + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), size, + /* allocator */ nullptr + )), + context(context) { + ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); +} + +${Storage}::~${Storage}() { + ${THStorage}_free(${state,} storage); +} + +size_t ${Storage}::elementSize() const { + return sizeof(${ScalarType}); +} + +size_t ${Storage}::size() const { + return storage->size; +} + +void* ${Storage}::data() { + return storage->data_ptr.get(); +} + +const void* ${Storage}::data() const { + return storage->data_ptr.get(); +} + +auto ${Storage}::retain() -> ${Storage}& { + ${THStorage}_retain(${state,} storage); + return *this; +} + +auto ${Storage}::free() -> ${Storage}& { + ${THStorage}_free(${state,} storage); + return *this; +} + +void* ${Storage}::unsafeGetTH(bool retain) const { + if (retain) { + ${THStorage}_retain(${state,} storage); + } + return storage; +} + +auto ${Storage}::resize(int64_t new_size) -> ${Storage}& { + ${THStorage}_resize(${state,} storage, new_size); + return *this; +} + +auto ${Storage}::fill(Scalar value) -> ${Storage}& { + ${THStorage}_fill(${state,} storage, ${to_th_type}(value.to${ScalarName}())); + return *this; +} + +auto ${Storage}::set(size_t ind, Scalar value) -> ${Storage}& { + ${THStorage}_set(${state,} storage, ind, 
${to_th_type}(value.to${ScalarName}())); + return *this; +} + +auto ${Storage}::fast_set(size_t ind, Scalar value) -> ${Storage}& { + throw std::runtime_error("unsupported operation 'fast_set'"); +} + +auto ${Storage}::get(size_t ind) -> Scalar { + // static cast to fix long -> int64_t issues + return static_cast<${ScalarType}>(${to_at_type}(${THStorage}_get(${state,} storage, ind))); +} + +auto ${Storage}::fast_get(size_t ind) -> Scalar { + if(${isCUDA}) + throw std::runtime_error("unsupported operation 'fast_get'"); + return static_cast<${ScalarType}>(${to_at_type}(storage->unsafe_data<${THScalarType}>()[ind])); +} + +void ${Storage}::set_flag(char flag) { + ${THStorage}_setFlag(${state,} storage, flag); +} + +void ${Storage}::clear_flag(char flag) { + ${THStorage}_clearFlag(${state,} storage, flag); +} + +int ${Storage}::getDevice() const { + return storage->data_ptr.device().index(); +} + +Type& ${Storage}::type() const { + return context->getType(Backend::${Backend},ScalarType::${ScalarName}); +} + +const char * ${Storage}::toString() const { + return "${Storage}"; +} + +const char * ${Storage}::typeString() { + return "${Type}"; +} + +} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h new file mode 100644 index 0000000..d97d397 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.h @@ -0,0 +1,57 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Storage.h" +#include "ATen/Context.h" + +#include + +namespace at { + +struct Allocator; + +struct ${Storage} final : public Storage { +public: + explicit ${Storage}(Context* context); + ${Storage}(Context* context, THStorage *wrapped); + ${Storage}(Context* context, size_t size); + ${Storage}(Context* context, size_t size, Allocator* allocator); + ${Storage}(Context* context, + void * data, size_t size, const std::function & deleter); + virtual ~${Storage}(); + + virtual size_t elementSize() const override; + virtual size_t size() const override; + virtual void* data() override; + virtual const void* data() const override; + virtual ${Storage}& retain() override; + virtual ${Storage}& free() override; + virtual void * unsafeGetTH(bool retain) const override; + + virtual ${Storage}& resize(int64_t new_size) override; + virtual ${Storage}& fill(Scalar value) override; + virtual ${Storage}& set(size_t ind, Scalar value) override; + virtual ${Storage}& fast_set(size_t ind, Scalar value) override; + virtual Scalar get(size_t ind) override; + virtual Scalar fast_get(size_t ind) override; + + virtual void set_flag(char flag) override; + virtual void clear_flag(char flag) override; + + virtual Type& type() const override; + virtual int getDevice() const override; + virtual const char * toString() const override; + + static const char * typeString(); + + +protected: + friend struct ${Type}; + THStorage *storage; + Context* context; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h new file mode 100644 index 0000000..31e952e --- /dev/null +++ b/aten/src/ATen/templates/Tensor.h @@ -0,0 +1,250 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Generator.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Storage.h" +#include "ATen/TensorAccessor.h" +#include "ATen/TensorBase.h" +#include "ATen/TensorImpl.h" +#include "ATen/Utils.h" +#include "ATen/Device.h" +#include "ATen/Layout.h" +#include "ATen/optional.h" + +namespace at { +struct Type; +struct 
Tensor; +struct TensorOptions; +namespace detail { +void set_data(Tensor& tensor, Tensor new_data); +} // namespace detail +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct Tensor : public detail::TensorBase { + using TensorBase = detail::TensorBase; + Tensor() : TensorBase() {} + Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} + Tensor(const TensorBase & rhs) : TensorBase(rhs) {} + Tensor(const Tensor & rhs) = default; + Tensor(Tensor && rhs) noexcept = default; + + // reimplemented from TensorBase so the return type is Tensor rather than TensorBase + Tensor & operator=(Tensor && rhs) & { + rhs.swap(*this); + return *this; + } + Tensor & operator=(Tensor const & rhs) & { + //Tensor ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl + Tensor(rhs).swap(*this); + return *this; + } + + inline Tensor & operator=(Tensor const & rhs) &&; + Tensor & operator=(Scalar v) &&; + const char * toString() const { + return pImpl->toString(); + } + IntList sizes() const { + return pImpl->sizes(); + } + IntList strides() const { + return pImpl->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return pImpl->type(); + } + std::unique_ptr storage() const { + return pImpl->storage(); + } + inline Tensor toType(const Type & t, bool non_blocking=false) const; + inline Tensor & copy_(const Tensor & src, bool non_blocking=false); + inline Tensor toType(ScalarType t) const; + inline Tensor toBackend(Backend b) const; + + /// New-style `to()` methods. + /// NB: These methods are defined in TensorOptions.h. + Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; + Tensor to(ScalarType dtype, bool non_blocking = false) const; + Tensor to(Device device, bool non_blocking = false) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. 
+ TensorOptions options() const; + + template + T * data() const; + + void * unsafeGetTH(bool retain) const { + return pImpl->unsafeGetTH(retain); + } + + // non-retaining + TensorImpl * unsafeGetTensorImpl() const { + return pImpl; + } + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES(TO_C_TYPE) + #undef TO_C_TYPE + + template + TensorAccessor accessor() const { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + pImpl->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return pImpl->requires_grad(); + } + + Tensor& grad() { + return pImpl->grad(); + } + const Tensor& grad() const { + return pImpl->grad(); + } + + Tensor detach() const { + return pImpl->detach(); + } + void detach_() { + pImpl->detach_(); + } + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + friend void detail::set_data(Tensor& tensor, Tensor new_data); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. + + //example + //Tensor * add(Tensor & b); + ${tensor_method_declarations} + + template + auto m(F func, Args&&... 
params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; +}; + +struct WeakTensor : public detail::WeakTensorBase { + using WeakTensorBase = detail::WeakTensorBase; + WeakTensor() : WeakTensorBase() {} + WeakTensor(TensorImpl * self, bool retain) : WeakTensorBase(self, retain) {} + WeakTensor(const WeakTensor & rhs) = default; + WeakTensor(WeakTensor && rhs) noexcept = default; + WeakTensor(const Tensor& t) : WeakTensorBase(t.pImpl, true) {} + + // reimplemented from TensorBase so the return type is WeakTensor rather than TensorBase + WeakTensor & operator=(WeakTensor && rhs) & { + rhs.swap(*this); + return *this; + } + WeakTensor & operator=(WeakTensor const & rhs) & { + //Tensor ctor retains original rhs.pImpl + //then rhs.pImpl is swapped with this->pImpl + //finally Tensor dtor releases rhs.pImpl, which was originally this->pImpl + WeakTensor(rhs).swap(*this); + return *this; + } + + WeakTensor & operator=(const Tensor& t) { + WeakTensor(t.pImpl, true).swap(*this); + return *this; + } + + // non-retaining + TensorImpl * unsafeGetTensorImpl() const { + return pImpl; + } + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return pImpl->weak_lock() ? Tensor(pImpl, false) : Tensor(); + } +}; + +namespace detail { +inline void set_data(Tensor& tensor, Tensor new_data) { + tensor.pImpl->set_data(new_data); +} +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp new file mode 100644 index 0000000..92ffeb3 --- /dev/null +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -0,0 +1,15 @@ +// included as 'TensorDenseOrSparse' in TensorDerived.cpp + +IntList ${Tensor}::strides() const { + return IntList(tensor->stride,dim()); +} +Scalar ${Tensor}::localScalar() { + int64_t numel = ${THTensor}_nElement(${state,}tensor); + AT_CHECK(numel == 1,"a Tensor with ", numel, " elements cannot be converted to Scalar"); + return Scalar(${to_at_type}(${THStorage}_get(${state,}tensor->storage, tensor->storageOffset))); +} +std::unique_ptr ${Tensor}::storage() { + auto storage = ${THTensor}_storage(${state,}tensor); + ${THStorage}_retain(${state,}storage); + return std::unique_ptr(new ${Storage}(&type().get_context(), storage)); +} diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp new file mode 100644 index 0000000..e15eb5f --- /dev/null +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -0,0 +1,59 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +// ${generated_comment} + +#include "ATen/Config.h" +#include "ATen/${Tensor}.h" +#include "ATen/${Storage}.h" +#include "ATen/Scalar.h" +#include "ATen/Half.h" + +$extra_cuda_headers + +namespace at { + +${Tensor}::${Tensor}(Context* context) +: ${Tensor}(context,${THTensor}_new(${state})) {} + +${Tensor}::${Tensor}(Context* context, ${THTensor} * tensor) +: TensorImpl(&context->getType(Backend::${Backend},ScalarType::${ScalarName})), + tensor(tensor), + context(context) {} +${Tensor}::~${Tensor}() { + ${THTensor}_free(${state,} tensor); +} + +const char * ${Tensor}::toString() const { + return "${Tensor}"; +} + +IntList ${Tensor}::sizes() const { + return IntList(tensor->size,dim()); +} + +int64_t ${Tensor}::dim() const { + if(isScalar()) + return 0; + return 
tensor->dim(); +} + +const char * ${Tensor}::typeString() { + return "${Type}"; +} +void * ${Tensor}::unsafeGetTH(bool retain) { + if (retain) + ${THTensor}_retain(${state,} tensor); + return tensor; +} + +void ${Tensor}::release_resources() { + ${THTensor}_free(${state,} tensor); + tensor = nullptr; +} + +${TensorDenseOrSparse} + +} diff --git a/aten/src/ATen/templates/TensorDerived.h b/aten/src/ATen/templates/TensorDerived.h new file mode 100644 index 0000000..892d6bc --- /dev/null +++ b/aten/src/ATen/templates/TensorDerived.h @@ -0,0 +1,37 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Tensor.h" +#include "ATen/TensorImpl.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" + +namespace at { + +struct ${Tensor} final : public TensorImpl { +public: + explicit ${Tensor}(Context* context); + ${Tensor}(Context* context, ${THTensor} * tensor); + virtual ~${Tensor}(); + virtual const char * toString() const override; + virtual IntList sizes() const override; + virtual IntList strides() const override; + virtual int64_t dim() const override; + virtual Scalar localScalar() override; + virtual void * unsafeGetTH(bool retain) override; + virtual std::unique_ptr storage() override; + virtual void release_resources() override; + static const char * typeString(); + +//TODO(zach): sort of friend permissions later so this +// can be protected +public: + ${THTensor} * tensor; + Context* context; + friend struct ${Type}; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h new file mode 100644 index 0000000..846f5c5 --- /dev/null +++ b/aten/src/ATen/templates/TensorMethods.h @@ -0,0 +1,62 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Tensor.h" +#include "ATen/Scalar.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Type.h" + +namespace at { + +inline Tensor & Tensor::operator=(Tensor const & rhs) && { + return copy_(rhs); +} + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + + +// all static inline to allow for inlining of the non-dynamic part of dispatch +${tensor_method_definitions} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return pImpl->localScalar().to##name (); } + +AT_FORALL_SCALAR_TYPES(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp new file mode 100644 index 0000000..ea75f1c --- /dev/null +++ b/aten/src/ATen/templates/Type.cpp @@ -0,0 +1,108 @@ +#include "ATen/Type.h" + +// ${generated_comment} + +#include "ATen/ExpandUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Scalar.h" +#include 
"ATen/SparseTensorRef.h" +#include "ATen/Storage.h" +#include "ATen/Tensor.h" +#include "ATen/TensorOptions.h" +#include "ATen/UndefinedType.h" +#include "ATen/DeviceGuard.h" + +#include + +#include +${cpu_type_headers} + +namespace at { + +void Type::registerCPU(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); +} + +Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { + Tensor b_src; + std::tie(b_src) = expand_inplace(self, src, "copy"); + return s_copy_(self, b_src, non_blocking); +} + +Tensor Type::copy(const Tensor & src, bool non_blocking) const { + // TODO(psag): have a DeviceGuard here + AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); + if (is_sparse()) { + auto indices = src._indices(); + auto values = src._values(); + auto & this_dense = toBackend(is_cuda() ? Backend::CUDA : Backend::CPU); + auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long); + auto indices_copy = this_dense_idx.copy(indices, non_blocking); + auto values_copy = this_dense.copy(values, non_blocking); + return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes()); + } else { + Tensor r = this->tensor(src.sizes()); + r.copy_(src, non_blocking); + return r; + } +} + +Type & Type::toBackend(Backend b) const { + return context->getType(b,scalarType()); +} +Type & Type::toScalarType(ScalarType s) const { + return context->getType(backend(),s); +} +static std::vector defaultStrides(IntList sizes) { + std::vector strides(sizes.size()); + int64_t stride = 1; + for(size_t i = sizes.size(); i > 0; --i) { + strides[i-1] = stride; + stride *= sizes[i-1]; + } + return strides; +} +static int64_t computeStorageSize(IntList sizes, IntList strides) { + // size of the underlying storage is 1 bigger than the offset + // of the last element according to stride + int64_t size = 1; + for(size_t i = 0; i < sizes.size(); i++) { + if(sizes[i] == 0) { + return 0; + } + size += strides[i]*(sizes[i]-1); + } + return size; +} +Tensor Type::tensorFromBlob(void * data, IntList sizes, const std::function & deleter) const { + return tensorFromBlob(data, sizes, defaultStrides(sizes), deleter); +} +Tensor Type::tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter) const { + auto storage = storageFromBlob(data, computeStorageSize(sizes, strides), deleter); + return tensor(*storage, 0, sizes, strides); +} +Tensor Type::tensorWithAllocator(IntList sizes, Allocator* allocator) const { + return tensorWithAllocator(sizes, defaultStrides(sizes), std::move(allocator)); +} +Tensor Type::tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const { + auto storage = storageWithAllocator(computeStorageSize(sizes, strides), std::move(allocator)); + return tensor(*storage, 0, sizes, strides); +} +Tensor Type::scalarTensor(Scalar s) const { + if(s.isBackedByTensor()) + return Tensor(s.t).toType(*this); + return tensor({}).fill_(s); +} + +bool Type::operator==(const Type& other) const { + return this == &other; +} +bool Type::operator!=(const Type& other) const { + return this != &other; +} + +${type_method_definitions} + +} diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h new file mode 100644 index 0000000..459e363 --- /dev/null +++ b/aten/src/ATen/templates/Type.h @@ -0,0 +1,122 @@ +#pragma once + +// ${generated_comment} + +#include "ATen/Allocator.h" +#include 
"ATen/ArrayRef.h" +#include "ATen/ATenGeneral.h" +#include "ATen/Generator.h" +#include "ATen/Half.h" +#include "ATen/Scalar.h" +#include "ATen/ScalarType.h" +#include "ATen/SparseTensorRef.h" +#include "ATen/Tensor.h" +#include "ATen/Deprecated.h" +#include "ATen/Layout.h" +#include "THNN/Reduction.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + ${type_ids} + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(Context* context, bool is_variable, bool is_undefined) + : context(context), is_variable_(is_variable), is_undefined_(is_undefined) {} + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const noexcept { return is_undefined_; } + static void registerCPU(Context * context); + virtual std::unique_ptr storage() const = 0; + virtual std::unique_ptr storage(size_t size) const = 0; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const = 0; + virtual const char * toString() const = 0; + virtual size_t elementSizeInBytes() const = 0; + virtual Type & toBackend(Backend b) const; + virtual Type & toScalarType(ScalarType s) const; + Type & toSparse() const { + return this->toBackend(at::toSparse(this->backend())); + } + Type & toDense() const { + return this->toBackend(at::toDense(this->backend())); + } + Context& get_context() const { return *context; } + + // contingious IDs for all types in the system + // for external dispatch + virtual TypeID ID() const = 0; + + Tensor copy(const Tensor & src, bool non_blocking=false) const; + Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const; + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; + + Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const; + Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const; + Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const; + Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const; + Tensor scalarTensor(Scalar s) const; + + bool operator==(const Type& other) const; + bool operator!=(const Type& other) const; + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + ${type_method_declarations} +protected: + Context* context; + bool is_variable_; + bool is_undefined_; + +}; + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const 
noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().backend(), type().is_cuda() ? get_device() : -1); +} + +} // namespace at diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp new file mode 100644 index 0000000..6699070 --- /dev/null +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -0,0 +1,98 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#define __STDC_FORMAT_MACROS + +#include "ATen/${Type}.h" + +// ${generated_comment} + +$storage_tensor_headers +#include "ATen/${Generator}.h" +#include "ATen/${DenseTensor}.h" +#include "ATen/${DenseBackend}LongTensor.h" +#include "ATen/Allocator.h" +#include "ATen/Half.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/THLongStorageView.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/Utils.h" +#include "ATen/DeviceGuard.h" +#include "ATen/optional.h" + +#include +#include +#include +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Type}::${Type}(Context* context) + : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} +ScalarType ${Type}::scalarType() const { + return ScalarType::${ScalarName}; +} +Backend ${Type}::backend() const { + return Backend::${Backend}; +} +bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_distributed() const { return false; } + +std::unique_ptr ${Type}::storage() const { + return std::unique_ptr(new ${Storage}(context)); +} +std::unique_ptr ${Type}::storage(size_t size) const { + return std::unique_ptr(new ${Storage}(context,size)); +} +std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { + return std::unique_ptr( + new ${Storage}(context,data,size,deleter)); +} +std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { + return std::unique_ptr( + new ${Storage}(context, size, allocator)); +} +Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { + if (retain) + ${THTensor}_retain(${state,} (${THTensor}*) th_pointer); + return Tensor(new ${Tensor}(context,(${THTensor}*)(th_pointer)), false); +} +std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { + if (retain) + ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); + return std::unique_ptr(new ${Storage}(context, (${THStorage}*) th_pointer)); +} +std::unique_ptr ${Type}::generator() const { + return std::unique_ptr(new ${Generator}(context)); +} + +const char * ${Type}::toString() const { + return ${Type}::typeString(); +} +TypeID ${Type}::ID() const { + return ${TypeID}; +} + +size_t ${Type}::elementSizeInBytes() const { + return sizeof(${ScalarType}); +} + +const char * ${Type}::typeString() { + return "${Type}"; +} + +/* example +Tensor * ${Type}::add(Tensor & a, Tensor & b) { + std::cout << "add ${Tensor}\n"; + return &a; +} +*/ + +${type_derived_method_definitions} + +} diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h new file mode 100644 index 0000000..92d3cf2 --- /dev/null +++ b/aten/src/ATen/templates/TypeDerived.h @@ -0,0 +1,45 @@ +#pragma once + +// ${generated_comment} + +#include 
"ATen/Type.h" +#include "ATen/Context.h" +#include "ATen/TensorMethods.h" +#include "ATen/CheckGenerator.h" + +#ifdef _MSC_VER +#ifdef Type +#undef Type +#endif +#endif + +namespace at { + +struct ${Type} final : public Type { + explicit ${Type}(Context* context); + virtual ScalarType scalarType() const override; + virtual Backend backend() const override; + virtual bool is_cuda() const override; + virtual bool is_sparse() const override; + virtual bool is_distributed() const override; + virtual std::unique_ptr storage() const override; + virtual std::unique_ptr storage(size_t size) const override; + virtual std::unique_ptr storageFromBlob(void * data, int64_t size, const std::function & deleter) const override; + virtual std::unique_ptr storageWithAllocator(int64_t size, Allocator* allocator) const override; + virtual std::unique_ptr generator() const override; + virtual const char * toString() const override; + virtual size_t elementSizeInBytes() const override; + virtual TypeID ID() const override; + static const char * typeString(); + virtual std::unique_ptr unsafeStorageFromTH(void * th_pointer, bool retain) const override; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) override; + + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; + ${type_derived_method_declarations} +}; + +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt new file mode 100644 index 0000000..25d84a3 --- /dev/null +++ b/aten/src/ATen/test/CMakeLists.txt @@ -0,0 +1,36 @@ +IF (MSVC) + IF (MSVC_VERSION LESS 1911) + return() + ENDIF() +ENDIF(MSVC) + +list(APPEND ATen_CPU_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/scalar_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/apply_utils_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/basic.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/atest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/half_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/broadcast_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/wrapdim_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dlconvertor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/native_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/scalar_tensor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_parallel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/undefined_tensor_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/verify_api_visibility.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tbb_init_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/weakref_test.cpp) + +list(APPEND ATen_CUDA_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/integer_divider_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rng_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp) +if (CUDNN_FOUND) + list(APPEND ATen_CUDA_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) +endif() + +# ---[ Send the lists to the parent scope. +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp new file mode 100644 index 0000000..986f599 --- /dev/null +++ b/aten/src/ATen/test/apply_test.cpp @@ -0,0 +1,121 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "cuda.h" +#include "cuda_runtime.h" + +#include "ATen/cuda/detail/TensorInfo.cuh" + +/* +Tests related to tensor indexing and applying operations. 
+*/ +#ifndef _WIN32 + +TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { + int sizes[] = {4, 4}; + int strides[] = {4, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (4 * 4)); +} + +TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { + int sizes[] = {6, 3, 7}; + int strides[] = {3 * 7, 7, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (6 * 3 * 7)); +} + +TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { + int sizes[] = {4, 3, 2}; + int strides[] = {3 * 3, 3, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (4 * 3)); + REQUIRE(ti.sizes[1] == 2); +} + +TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { + int sizes[] = {3, 2}; + int strides[] = {2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == (3 * 2)); + REQUIRE(ti.strides[0] == 2); +} + +TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (3 * 6)); + REQUIRE(ti.strides[0] == 22); + REQUIRE(ti.sizes[1] == (5 * 2)); + REQUIRE(ti.strides[1] == 2); +} + +TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { + int sizes[] = {1, 10, 1, 5, 4}; + int strides[] = {4, 0, 16, 0, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; + ti.collapseDims(); + REQUIRE(ti.dims == 2); + REQUIRE(ti.sizes[0] == (10 * 5)); + REQUIRE(ti.strides[0] == 0); + REQUIRE(ti.sizes[1] == 4); + REQUIRE(ti.strides[1] == 1); +} + +TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + REQUIRE(ti.collapseDims() == 0); + REQUIRE(ti.dims == 1); + REQUIRE(ti.sizes[0] == 1); + REQUIRE(ti.strides[0] == 1); +} + +TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + REQUIRE(ti.collapseDims(1) == 1); + REQUIRE(ti.dims == 3); + REQUIRE(ti.sizes[0] == 3); + REQUIRE(ti.strides[0] == (6 * 22)); + REQUIRE(ti.sizes[1] == 6); + REQUIRE(ti.strides[1] == 22); + REQUIRE(ti.sizes[2] == (5 * 2)); + REQUIRE(ti.strides[2] == 2); +} + +TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + REQUIRE(ti.collapseDims(2) == 1); + REQUIRE(ti.dims == 3); + REQUIRE(ti.sizes[0] == (3 * 6)); + REQUIRE(ti.strides[0] == 22); + REQUIRE(ti.sizes[1] == 5); + REQUIRE(ti.strides[1] == 4); + REQUIRE(ti.sizes[2] == 2); + REQUIRE(ti.strides[2] == 2); +} + +TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, 
strides}; + REQUIRE_THROWS(ti.collapseDims(5)); +} + +#endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp new file mode 100644 index 0000000..24359a0 --- /dev/null +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -0,0 +1,139 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/CPUApplyUtils.h" +#include "test_assert.h" +#include "test_seed.h" + +#include +using namespace std; +using namespace at; + +void fill_tensor(int64_t scalar, Tensor& t_) { + auto t = t_.view(-1); + for (int64_t i = 0; i < t.numel(); i++) { + t[i] = (i + 1) * scalar; + } +} + +// This test exercises all sequential applyX functions. Given a shape and two +// transpose dimensions we create 5 tensors (a0, ..., a4) of the given shape and +// transpose the dimension a with b for each tensor. Then we call the applyX +// function on each floating type. a4 is allocated in doubles only, whereas a0, +// ..., a3 are allocated in the given type. For each applyX function we once +// write the same type as we read (using a0, ..., aX-1) and we once write to +// double (using a4 as a target). We also exercise on a zero_dim and empty +// tensor. +void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { + auto zero_dim = at::empty({}, type); + zero_dim.fill_(2); + zero_dim.exp_(); + AT_DISPATCH_FLOATING_TYPES(zero_dim.type(), "test0", [&] { + ASSERT(zero_dim.data()[0] == std::exp(2)); + }); + + auto empty_t = at::empty({0}, type); + empty_t.fill_(3); + empty_t.exp_(); + + auto a0 = type.tensor(); + auto a1 = type.tensor(); + auto a2 = type.tensor(); + auto a3 = type.tensor(); + auto a4 = CPU(kDouble).tensor(); + + std::vector tensors({a0, a1, a2, a3, a4}); + for (size_t i = 0; i < tensors.size(); i++) { + tensors[i].resize_(shape); + fill_tensor(i + 1, tensors[i]); + if (a >= 0 && b >= 0) { + tensors[i].transpose_(a, b); + } + } + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test1", [&] { + CPU_tensor_apply2( + a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; }); + CPU_tensor_apply2( + a4, a1, [](double& y, scalar_t x) { y = (double)(x * x); }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test2", [&] { + CPU_tensor_apply3( + a0, a1, a2, [](scalar_t& y, const scalar_t& x, const scalar_t& z) { + y = x * x + z; + }); + CPU_tensor_apply3( + a4, a1, a2, [](double& y, const scalar_t& x, const scalar_t& z) { + y = (double)(x * x + z); + }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + target = target + a2.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); + + AT_DISPATCH_FLOATING_TYPES(a0.type(), "test3", [&] { + CPU_tensor_apply4( + a0, + a1, + a2, + a3, + [](scalar_t& y, + const scalar_t& x, + const scalar_t& z, + const scalar_t& a) { y = x * x + z * a; }); + CPU_tensor_apply4( + a4, + a1, + a2, + a3, + [](double& y, const scalar_t& x, const scalar_t& z, const scalar_t& a) { + y = (double)(x * x + z * a); + }); + for (int64_t i = 0; i < a0.numel(); i++) { + auto target = a1.data()[i] * a1.data()[i]; + target = target + a2.data()[i] * a3.data()[i]; + ASSERT(a0.data()[i] == target); + ASSERT(a4.data()[i] == target); + } + }); +} + +TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {2, 1}, -1, -1); +} + +TEST_CASE("apply 
utils test 2-dim small", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {2, 1}); +} + +TEST_CASE("apply utils test 2-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {20, 10}); +} + +TEST_CASE("apply utils test 3-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 4, 2}); +} + +TEST_CASE("apply utils test 3-dim medium", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 40, 2}); +} + +TEST_CASE("apply utils test 10-dim", "[cpu]") { + manual_seed(123, at::Backend::CPU); + test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); +} diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp new file mode 100644 index 0000000..af25179 --- /dev/null +++ b/aten/src/ATen/test/atest.cpp @@ -0,0 +1,113 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +#include +using namespace std; +using namespace at; + +void trace() { + Tensor foo = rand({12,12}); + + // ASSERT foo is 2-dimensional and holds floats. + auto foo_a = foo.accessor(); + float trace = 0; + + for(int i = 0; i < foo_a.size(0); i++) { + trace += foo_a[i][i]; + } + + REQUIRE(Scalar(foo.trace()).toFloat() == Approx(trace)); +} + +TEST_CASE( "atest", "[]" ) { + + manual_seed(123, at::Backend::CPU); + manual_seed(123, at::Backend::CUDA); + + auto foo = rand({12,6}); + REQUIRE(foo.data() == foo.toFloatData()); + + REQUIRE(foo.size(0) == 12); + REQUIRE(foo.size(1) == 6); + + foo = foo+foo*3; + foo -= 4; + + { + Tensor no; + REQUIRE_THROWS(add_out(no,foo,foo)); + } + Scalar a = 4; + + float b = a.to(); + REQUIRE(b == 4); + + foo = (foo*foo) == (foo.pow(3)); + foo = 2 + (foo+1); + //foo = foo[3]; + auto foo_v = foo.accessor(); + + for(int i = 0; i < foo_v.size(0); i++) { + for(int j = 0; j < foo_v.size(1); j++) { + foo_v[i][j]++; + } + } + + REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + + trace(); + + float data[] = { 1, 2, 3, + 4, 5, 6}; + + auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); + auto f_a = f.accessor(); + + REQUIRE(f_a[0][0][0] == 1.0); + REQUIRE(f_a[0][1][1] == 5.0); + + REQUIRE(f.strides()[0] == 6); + REQUIRE(f.strides()[1] == 3); + REQUIRE(f.strides()[2] == 1); + REQUIRE(f.sizes()[0] == 1); + REQUIRE(f.sizes()[1] == 2); + REQUIRE(f.sizes()[2] == 3); + + REQUIRE_THROWS(f.resize_({3,4,5})); + { + int isgone = 0; + { + auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { + isgone++; + }); + } + REQUIRE(isgone == 1); + } + { + int isgone = 0; + Tensor a_view; + { + auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { + isgone++; + }); + a_view = f2.view({3,2,1}); + } + REQUIRE(isgone == 0); + a_view.reset(); + REQUIRE(isgone == 1); + } + + if(at::hasCUDA()) { + int isgone = 0; + { + auto base = CUDA(kFloat).tensor({1,2,3}); + auto f2 = CUDA(kFloat).tensorFromBlob(base.data_ptr(), {1,2,3}, [&](void*) { + isgone++; + }); + } + REQUIRE(isgone==1); + } +} diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp new file mode 100644 index 0000000..6b46c8c --- /dev/null +++ b/aten/src/ATen/test/basic.cpp @@ -0,0 +1,287 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "THNN/Reduction.h" + +// for TH compat test only... 
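+// The forward declarations below let this test build a raw THFloatTensor and
+// hand it to ATen without pulling in the TH headers, e.g.
+//   THFloatTensor* t = THFloatTensor_newWithSize2d(4, 4);
+//   Tensor tt = CPU(kFloat).unsafeTensorFromTH(t, /*retain=*/false);
+// (see the "tensor from TH" section further down)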
+struct THFloatTensor; +extern "C" THFloatTensor * THFloatTensor_newWithSize2d(size_t a, size_t b); +extern "C" void THFloatTensor_fill(THFloatTensor *, float v); + +#include +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +using Catch::Matchers::StartsWith; + +static void test(Type & type) { + SECTION( "resize" ) { + auto a = type.tensor(); + a.resize_({3,4}); + REQUIRE(a.numel() == 12); + a.resize_({5, 7}); + REQUIRE(a.numel() == 35); + + } + + SECTION( "ones and dot" ) { + Tensor b0 = ones({1, 1}, type); + REQUIRE(2 == (b0+b0).sum().toCDouble()); + + Tensor b1 = ones({1, 2}, type); + REQUIRE(4 == (b1+b1).sum().toCDouble()); + + Tensor b = ones({3, 4}, type); + REQUIRE(24 == (b+b).sum().toCDouble()); + REQUIRE(12 == b.numel()); + REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + } + + SECTION( "rand" ) { + for(auto i = 0; i < 10; i++) { + Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); + } + } + + SECTION( "sort" ) { + Tensor b = rand({3, 4}, type); + + auto z = b.sort(1); + auto z_sorted = std::get<0>(z); + + REQUIRE(Scalar(z_sorted[0][0]).toFloat() < Scalar(z_sorted[0][1]).toFloat()); + } + + if(type.backend() != kCUDA) + SECTION( "randperm" ) { + Tensor b = randperm(15, type); + Tensor rv, ri; + std::tie(rv, ri) = sort(b, 0); + REQUIRE(Scalar(rv[0]).toFloat() <= Scalar(rv[1]).toFloat()); + } + + SECTION( "context" ) { + std::stringstream ss; + ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; + } + + SECTION( "add" ) { + Tensor a = rand({3, 4}, type); + Tensor b = rand({3, 4}, type); + Tensor c = add(a, add(a, b)); + //TODO:0-dim Tensor d(3.f); + Scalar d = 3.f; + REQUIRE( add(c, d).allclose(a + a + b + d) ); + } + + SECTION( "loads of adds" ) { + auto begin = std::chrono::high_resolution_clock::now(); + Tensor d = ones({3, 4}, type); + Tensor r = zeros({3, 4}, type); + for(auto i = 0; i < 100000; i++) { + add_out(r, r, d); + } + auto end = std::chrono::high_resolution_clock::now(); + //TODO TEST PERF? + std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; + REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + } + + SECTION( "loads of adds (with copy)" ) { + auto begin = std::chrono::high_resolution_clock::now(); + Tensor d = ones({3, 4}, type); + Tensor r = zeros({3, 4}, type); + for(auto i = 0; i < 100000; i++) { + r = add(r, d); + } + auto end = std::chrono::high_resolution_clock::now(); + //TODO TEST PERF? 
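+    // Unlike the in-place loop above, r = add(r, d) allocates a fresh result
+    // tensor on every iteration, so this also exercises allocation; the norm
+    // comparison below checks that all 100000 additions accumulated correctly.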
+ std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; + REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + } + + SECTION( "isContiguous" ) { + Tensor a = rand({3, 4}, type); + REQUIRE(a.is_contiguous()); + a = a.transpose(0, 1); + REQUIRE(!a.is_contiguous()); + } + + SECTION( "permute" ) { + Tensor a = rand({3, 4, 5}, type); + Tensor b = a.permute({1, 2, 0}); + REQUIRE(b.sizes().equals({4, 5, 3})); + REQUIRE(b.strides().equals({5, 1, 20})); + } + + SECTION( "mm" ) { + Tensor a = rand({3, 4}, type); + Tensor b = rand({4}, type); + Tensor c = mv(a, b); + REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + } + + SECTION( "squeeze" ) { + Tensor a = rand({2, 1}, type); + Tensor b = squeeze(a); + REQUIRE(b.dim() == 1); + a = rand({1}, type); + b = squeeze(a); + //TODO 0-dim squeeze + REQUIRE(a[0].equal(b)); + } + + SECTION( "copy" ) { + Tensor a = zeros({4, 3}, type); + Tensor e = rand({4, 3}, type); + a.copy_(e); + REQUIRE(a.equal(e)); + } + + SECTION( "copy (broadcasting)" ) { + Tensor a = zeros({4, 3}, type); + Tensor e = rand({3}, type); + a.copy_(e); + for (int i = 0; i < 4; ++i) { + REQUIRE(a[i].equal(e)); + } + } + + SECTION( "abs(value)" ) { + Tensor r = at::abs(type.scalarTensor(-3)); + REQUIRE(Scalar(r).toInt() == 3); + } + +//TODO(zach): operator overloads +#if 0 + { + std::cout << "eq (value):" << std::endl; + Tensor a = Tensor(10.f); + std::cout << (a == 11_i64) << " -- should be 0" << std::endl; + std::cout << (a == 10_i64) << " -- should be 1" << std::endl; + std::cout << (a == 10.) << " -- should be 1" << std::endl; + } +#endif + + SECTION( "adding a value with a scalar" ) { + Tensor a = rand({4, 3}, type); + REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + } + + SECTION( "select" ) { + Tensor a = rand({3, 7}, type); + auto a_13 = select(a, 1, 3); + auto a_13_02 = select(select(a, 1, 3), 0, 2); + REQUIRE( a[0][3].equal(a_13[0]) ); + REQUIRE( a[2][3].equal(a_13_02) ); + } + + SECTION( "zero-dim" ) { + Tensor a = type.scalarTensor(4); //rand(type, {1}); + + REQUIRE_NOTHROW(Scalar(a)); + Tensor b = rand({3,4}, type); + REQUIRE((a + a).dim() == 0); + REQUIRE((1 + a).dim() == 0); + REQUIRE((b + a).dim() == 2); + REQUIRE((a + b).dim() == 2); + auto c = rand({3,4}, type); + REQUIRE(c[1][2].dim() == 0); + + auto f = rand({3,4}, type); + f[2] = zeros({4}, type); + f[1][0] = -1; + REQUIRE(Scalar(f[2][0]).toDouble() == 0); + } + + SECTION( "tensor from TH" ) { + int a = 4; + THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); + THFloatTensor_fill(t, a); + Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); + REQUIRE_NOTHROW(tt); + } + + SECTION( "toCFloat" ) { + Tensor a = zeros({3,4}); + Tensor b = ones({3,7}); + Tensor c = cat({a,b},1); + REQUIRE(c.size(1) == 11); + + Tensor e = rand({}); + REQUIRE(*e.data() == e.sum().toCFloat()); + } + + SECTION( "to string" ) { + Tensor b = ones({3,7})*.0000001f; + std::stringstream s; + s << b << "\n"; + std::string expect = "1e-07 *"; + REQUIRE(s.str().substr(0,expect.size()) == expect); + } + SECTION("indexing by Scalar") { + Tensor tensor = arange(0, 10, kInt); + Tensor one = ones({1}, kInt); + for (int64_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { + REQUIRE(tensor[i].equal(one * static_cast(i))); + } + for (int i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for (int16_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + for 
(int8_t i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[i].equal(one * i)); + } + REQUIRE_THROWS_WITH( + tensor[Scalar(3.14)].equal(one), + StartsWith( + "Can only index tensors with integral scalars (got CPUDoubleType)")); + } + SECTION("indexing by zero-dim tensor") { + Tensor tensor = arange(0, 10, kInt); + Tensor one = ones({}, kInt); + for (int i = 0; i < tensor.numel(); ++i) { + REQUIRE(tensor[one * i].equal(one * i)); + } + REQUIRE_THROWS_WITH( + tensor[ones({}) * 3.14].equal(one), + StartsWith( + "Can only index tensors with integral scalars (got CPUFloatType)")); + REQUIRE_THROWS_WITH( + tensor[Tensor()].equal(one), + StartsWith("Can only index with tensors that are defined")); + REQUIRE_THROWS_WITH( + tensor[ones({2, 3, 4}, kInt)].equal(one), + StartsWith("Can only index with tensors that are scalars (zero-dim)")); + } + SECTION("dispatch") { + Tensor tensor = randn({20, 20}); + Tensor other = randn({20, 20}); + auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); + REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + } +} + +TEST_CASE( "basic tests CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat)); +} + +TEST_CASE( "basic tests GPU", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if(at::hasCUDA()) { + test(CUDA(kFloat)); + } +} diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp new file mode 100644 index 0000000..2c98121 --- /dev/null +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -0,0 +1,154 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "broadcast", "[]" ) { + + manual_seed(123, at::Backend::CPU); + + Type & T = CPU(kFloat); + + // 0) pre-req tests: + SECTION( "can't expand empty tensor" ) { + auto empty = randn({0}, T); + REQUIRE_THROWS(empty.expand({3})); + } + + // 1) out-place function with 2 args + SECTION( "out-place function with 2 args" ) { + + SECTION( "basic" ) { + auto a = randn({3, 1}, T); + auto b = randn({5}, T); + std::vector expanded_sizes = {3, 5}; + REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + } + + SECTION( "with scalar" ) { + auto aScalar = ones({1}, T); + aScalar.get()->maybeScalar(true); + auto b = randn({3, 5}, T); + REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + } + + SECTION( "old fallback behavior yields error" ) { + auto a = randn({3, 5}, T); + auto b = randn({5, 3}, T); + REQUIRE_THROWS(a + b); + } + + SECTION( "with mismatched sizes" ) { + auto a = randn({3, 5}, T); + auto b = randn({7, 5}, T); + REQUIRE_THROWS(a + b); + } + } + + SECTION( "out-place function with 3 args" ) { + + SECTION( "basic" ) { + auto a = randn({3, 1, 1}, T); + auto b = randn({1, 2, 1}, T); + auto c = randn({1, 1, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + } + + SECTION( "with scalar" ) { + auto aTensorScalar = ones({1}, T); + aTensorScalar.get()->maybeScalar(true); + auto b = randn({3, 2, 1}, T); + auto c = randn({1, 2, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + REQUIRE(aTensorScalar.addcmul(b, c).equal( + aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); + } + + SECTION( "old fallback behavior yields error" ) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 3, 2}, T); + REQUIRE_THROWS(a.addcmul(b, 
c)); + } + + SECTION( "with mismatched sizes" ){ + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 5, 5}, T); + REQUIRE_THROWS(a.addcmul(b, c)); + } + } + + SECTION( "in-place function with 2 args" ) { + SECTION( "basic" ) { + auto a = randn({3, 5}, T); + auto b = randn({3, 1}, T); + REQUIRE((a + b).equal(a + b.expand({3, 5}))); + } + + SECTION( "with scalar" ) { + auto a = randn({3, 5}, T); + auto bScalar = ones({1}, T); + bScalar.get()->maybeScalar(true); + REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + } + + SECTION( "error: would have to expand inplace arg" ) { + auto a = randn({1, 5}, T); + auto b = randn({3, 1}, T); + REQUIRE_THROWS(a.add_(b)); + } + } + + SECTION( "in-place function with 3 args" ) { + + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + + SECTION( "basic" ) { + auto aClone = a.clone(); + REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + } + + SECTION( "with scalar" ) { + auto aClone = a.clone(); + auto bScalar = ones({1}, T); + bScalar.get()->maybeScalar(true); + REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + } + + SECTION( "error: would have to expand inplace arg" ) { + auto a = randn({1, 3, 5}, T); + auto b = randn({4, 1, 1}, T); + auto c = randn({1, 3, 1}, T); + REQUIRE_THROWS(a.addcmul_(b, c)); + } + } + + SECTION( "explicit dim specification" ) { + + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + + SECTION( "basic" ) { + REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + } + + SECTION( "with scalar" ) { + Tensor aScalar = ones({1}, T); + aScalar.get()->maybeScalar(true); + REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + } + + SECTION( "with mismatched sizes" ) { + auto a = randn({3, 3}, T); + REQUIRE_THROWS(a.addmm(b, c)); + } + } +} diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp new file mode 100644 index 0000000..d32903d --- /dev/null +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -0,0 +1,27 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "cuda.h" +#include "cuda_runtime.h" +#include + +void makeRandomNumber() { + cudaSetDevice(std::rand() % 2); + auto x = at::randn({1000}); +} + +void testCudaRNGMultithread() { + auto threads = std::vector(); + for (auto i = 0; i < 1000; i++) { + threads.emplace_back(makeRandomNumber); + } + for (auto& t : threads) { + t.join(); + } +}; + +TEST_CASE( "CUDA RNG test", "[cuda]" ) { + SECTION( "multithread" ) + testCudaRNGMultithread(); +} diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp new file mode 100644 index 0000000..7c1bc96 --- /dev/null +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -0,0 +1,25 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/cudnn/Descriptors.h" +#include "ATen/cudnn/Handles.h" +#include "test_seed.h" + +using namespace at; +using namespace at::native; + +TEST_CASE( "cudnn", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + +#if CUDNN_VERSION < 7000 + auto handle = getCudnnHandle(); + DropoutDescriptor desc1, desc2; + desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); + desc2.set(handle, 0.5, desc1.state); + + REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + REQUIRE(desc1.desc()->states == 
desc2.desc()->states); +#endif +} diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp new file mode 100644 index 0000000..1603e3d --- /dev/null +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -0,0 +1,27 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/DLConvertor.h" + +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "dlconvertor", "[cpu]" ) { + + manual_seed(123, at::Backend::CPU); + + INFO( "convert ATen to DLTensor" ); + + Tensor a = rand({3,4}); + DLManagedTensor* dlMTensor = toDLPack(a); + + INFO( "convert DLTensor to ATen" ); + Tensor b = fromDLPack(dlMTensor); + + REQUIRE(a.equal(b)); +} diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp new file mode 100644 index 0000000..fc70522 --- /dev/null +++ b/aten/src/ATen/test/half_test.cpp @@ -0,0 +1,117 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include +#include +#include +#include +#include + +using namespace at; + +TEST_CASE( "half arithmetic", "[]" ) { + Half zero = 0; + Half one = 1; + REQUIRE(zero + one == one); + REQUIRE(zero + zero == zero); + REQUIRE(zero * one == zero); + REQUIRE(one * one == one); + REQUIRE(one / one == one); + REQUIRE(one - one == zero); + REQUIRE(one - zero == one); + REQUIRE(zero - one == -one); + REQUIRE(one + one == Half(2)); + REQUIRE(one + one == 2); +} + +TEST_CASE( "half comparisons", "[]" ) { + Half zero = 0; + Half one = 1; + REQUIRE(zero < one); + REQUIRE(zero < 1); + REQUIRE(1 > zero); + REQUIRE(0 >= zero); + REQUIRE(0 != one); + REQUIRE(zero == 0); + REQUIRE(zero == zero); + REQUIRE(zero == -zero); +} + +TEST_CASE( "half cast", "[]" ) { + Half value = 1.5f; + REQUIRE((int)value == 1); + REQUIRE((short)value == 1); + REQUIRE((long long)value == 1LL); + REQUIRE((float)value == 1.5f); + REQUIRE((double)value == 1.5); + REQUIRE((bool)value == true); + REQUIRE((bool)Half(0.0f) == false); +} + +TEST_CASE( "half construction", "[]" ) { + REQUIRE(Half((short)3) == Half(3.0f)); + REQUIRE(Half((unsigned short)3) == Half(3.0f)); + REQUIRE(Half(3) == Half(3.0f)); + REQUIRE(Half(3U) == Half(3.0f)); + REQUIRE(Half(3LL) == Half(3.0f)); + REQUIRE(Half(3ULL) == Half(3.0f)); + REQUIRE(Half(3.5) == Half(3.5f)); +} + +static std::string to_string(const Half& h) { + std::stringstream ss; + ss << h; + return ss.str(); +} + +TEST_CASE( "half to string", "[]" ) { + REQUIRE(to_string(Half(3.5f)) == "3.5"); + REQUIRE(to_string(Half(-100.0f)) == "-100"); +} + +TEST_CASE( "half numeric limits", "[]" ) { + using limits = std::numeric_limits; + REQUIRE(limits::lowest() == -65504.0f); + REQUIRE(limits::max() == 65504.0f); + REQUIRE(limits::min() > 0); + REQUIRE(limits::min() < 1); + REQUIRE(limits::denorm_min() > 0); + REQUIRE(limits::denorm_min() / 2 == 0); + REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); +} + +// Check the declared type of members of numeric_limits matches +// the declared type of that member on numeric_limits + +#define ASSERT_SAME_TYPE(name) \ + static_assert( \ + std::is_same< \ + decltype(std::numeric_limits::name), \ + decltype(std::numeric_limits::name)>::value, \ + "decltype(" #name ") differs") + +ASSERT_SAME_TYPE(is_specialized); +ASSERT_SAME_TYPE(is_signed); +ASSERT_SAME_TYPE(is_integer); +ASSERT_SAME_TYPE(is_exact); +ASSERT_SAME_TYPE(has_infinity); +ASSERT_SAME_TYPE(has_quiet_NaN); 
+ASSERT_SAME_TYPE(has_signaling_NaN); +ASSERT_SAME_TYPE(has_denorm); +ASSERT_SAME_TYPE(has_denorm_loss); +ASSERT_SAME_TYPE(round_style); +ASSERT_SAME_TYPE(is_iec559); +ASSERT_SAME_TYPE(is_bounded); +ASSERT_SAME_TYPE(is_modulo); +ASSERT_SAME_TYPE(digits); +ASSERT_SAME_TYPE(digits10); +ASSERT_SAME_TYPE(max_digits10); +ASSERT_SAME_TYPE(radix); +ASSERT_SAME_TYPE(min_exponent); +ASSERT_SAME_TYPE(min_exponent10); +ASSERT_SAME_TYPE(max_exponent); +ASSERT_SAME_TYPE(max_exponent10); +ASSERT_SAME_TYPE(traps); +ASSERT_SAME_TYPE(tinyness_before); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu new file mode 100644 index 0000000..4c63ab3 --- /dev/null +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -0,0 +1,190 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +// Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or +// (b-1), so it takes a few minutes to run. + +#include +#include +#include +#include + +#include "THC/THCIntegerDivider.cuh" + +using std::vector; + +template +struct TestCase { + Value dividend; + int divisor_idx; + int steps; + + TestCase(Value dividend, int divisor_idx, int steps) + : dividend(dividend), divisor_idx(divisor_idx), steps(steps) { } +}; + +template +__global__ void testIntDivider(const IntDivider *dividers, + const TestCase *testCases, + int numCases) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = index; i < numCases; i += stride) { + const TestCase &tc = testCases[i]; + Value dividend = tc.dividend; + const IntDivider ÷r = dividers[tc.divisor_idx]; + Value divisor = divider.divisor; + + for (int j = 0; j < tc.steps; j++) { + if (sizeof(Value) == 4 && dividend > INT32_MAX) return; + + DivMod qr = divider.divmod(dividend); + assert(qr.div == dividend / divisor && qr.mod == dividend % divisor); + dividend += divisor; + } + } +} + +enum { + // Number of test cases per each kernel invocation. + NUM_CASES = 1000000, + + // Maximum number of steps per each test case. + MAX_STEPS = 10000, +}; + +// Test the magic division algorithm. +template +class IntDividerTester { + public: + IntDividerTester() { + cudaError_t err; + + err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); + REQUIRE(err == cudaSuccess); + err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); + REQUIRE(err == cudaSuccess); + } + + ~IntDividerTester() { + cudaError_t err; + + err = cudaFree(dividersBuf_); + REQUIRE(err == cudaSuccess); + err = cudaFree(testCasesBuf_); + REQUIRE(err == cudaSuccess); + } + + void addTestCase(Value dividend, Value divisor, int steps) { + // Append a new IntDivider using 'divisor' if necessary. + if (dividers_.empty() || dividers_.back().divisor != divisor) + dividers_.emplace_back(divisor); + + // Append the test case. + testCases_.emplace_back(dividend, dividers_.size() - 1, steps); + + // Launch the test kernel if the buffer is full. 
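+    // flush() copies the buffered dividers and test cases to the device,
+    // launches testIntDivider over them, and then clears the host buffers,
+    // so one kernel launch validates up to NUM_CASES cases in parallel.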
+ if (testCases_.size() == NUM_CASES) flush(); + } + + void flush() { + cudaError_t err; + + if (testCases_.empty()) return; + REQUIRE(!dividers_.empty()); + + REQUIRE(dividers_.size() <= NUM_CASES); + REQUIRE(testCases_.size() <= NUM_CASES); + err = cudaMemcpy(dividersBuf_, dividers_.data(), + dividers_.size() * sizeof(IntDivider), + cudaMemcpyHostToDevice); + REQUIRE(err == cudaSuccess); + err = cudaMemcpy(testCasesBuf_, testCases_.data(), + testCases_.size() * sizeof(TestCase), + cudaMemcpyHostToDevice); + REQUIRE(err == cudaSuccess); + + int numCases = testCases_.size(); + testIntDivider<<<512, 512>>>( + dividersBuf_, testCasesBuf_, numCases); + + dividers_.clear(); + testCases_.clear(); + } + + private: + vector> dividers_; + vector> testCases_; + + IntDivider *dividersBuf_; + TestCase *testCasesBuf_; +}; + +static void testUint32Divider() +{ + fprintf(stderr, "Testing 32-bit integer division ..."); + + IntDividerTester tester; + + for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) { + if (divisor < 1000000 && divisor % 10000 == 0) fprintf(stderr, "."); + if (divisor % 10000000 == 0) fprintf(stderr, "-"); + + // In order to save time, we only test when the remainder is zero or + // (divisor - 1). + uint64_t dividend = 0; + while (dividend <= INT32_MAX) { + uint64_t steps = (INT32_MAX - dividend) / divisor + 1; + if (steps > MAX_STEPS) steps = MAX_STEPS; + + tester.addTestCase(dividend, divisor, steps); + tester.addTestCase(dividend + divisor - 1, divisor, steps); + + dividend += divisor * steps; + } + + // Check the boundary cases. + tester.addTestCase(1, divisor, 1); + tester.addTestCase(INT32_MAX, divisor, 1); + } + + tester.flush(); + + fprintf(stderr, " Done!\n"); +} + +// uint64_t divider uses plain division, so we just check a few random cases. +static void testUint64Divider() +{ + IntDividerTester tester; + + uint64_t dividend = 0x123456789ULL; + uint64_t divisor = 0x54321ULL; + + for (int i = 0; i < 1000; i++) { + if (divisor != 0) { + tester.addTestCase(dividend, divisor, 100); + + // Test small divisor. + tester.addTestCase(dividend, divisor % 65536, 100); + + // Create pseudorandom numbers. 
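+      // (a simple multiply/xor scramble -- we only need varied operands here,
+      // not statistical randomness)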
+ dividend *= 0x100000001b3ULL; + dividend ^= 0x1234567890abcdefULL; + divisor *= 0x100000001b3ULL; + divisor ^= 0x1234567890abcdefULL; + } + } + + tester.flush(); +} + +TEST_CASE( "CUDA integer divider", "[cuda]" ) { + + testUint64Divider(); + testUint32Divider(); + + cudaError_t err = cudaDeviceSynchronize(); + REQUIRE(err == cudaSuccess); +} diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp new file mode 100644 index 0000000..99a21d3 --- /dev/null +++ b/aten/src/ATen/test/native_test.cpp @@ -0,0 +1,193 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +#define REQUIRE_EQUAL(t1, t2) \ + REQUIRE(t1.equal(t2)); + +#define REQUIRE_ALLCLOSE(t1, t2) \ + REQUIRE(t1.is_same_size(t2)); \ + REQUIRE(t1.allclose(t2)); + +#define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + REQUIRE(t1.is_same_size(t2)); \ + REQUIRE(t1.allclose(t2, atol, rtol)); + +void requireEqualTensorList(TensorList t1, TensorList t2) { + REQUIRE(t1.size() == t2.size()); + for (size_t i = 0; i < t1.size(); ++i) { + REQUIRE_EQUAL(t1[ i ], t2[ i ]); + } +} + +void test(Type & T, Type & AccT) { + auto t = randn({3, 3}, T); + + SECTION( "split: test method, type, namespace give same result" ) { + auto splitMethod = t.split(1, 0); + auto splitType = T.split(t, 1, 0); + auto splitNs = at::split(t, 1, 0); + requireEqualTensorList(splitMethod, splitType); + requireEqualTensorList(splitMethod, splitNs); + + // test rebuilding with cat + REQUIRE_EQUAL(at::cat(splitMethod, 0), t); + } + + SECTION( "chunk: test method, type, namespace give same result" ) { + // test method, type, namespace give same result + auto chunkMethod = t.chunk(3, 0); + auto chunkType = T.chunk(t, 3, 0); + auto chunkNs = at::chunk(t, 3, 0); + requireEqualTensorList(chunkMethod, chunkType); + requireEqualTensorList(chunkMethod, chunkNs); + + // test rebuilding with cat + REQUIRE_EQUAL(at::cat(chunkMethod, 0), t); + } + + // stack + SECTION( "stack" ) { + auto x = rand({2, 3, 4}); + auto y = rand({2, 3, 4}); + auto z = rand({2, 3, 4}); + for (int64_t dim = 0; dim < 4; ++dim) { + auto res = at::stack({x, y, z}, dim); + auto res_neg = at::stack({x, y, z}, dim - 4); + std::vector expected_size; + expected_size.insert(expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); + expected_size.insert(expected_size.end(), 3); + expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); + + REQUIRE_EQUAL(res, res_neg); + REQUIRE(res.sizes().equals(expected_size)); + REQUIRE_EQUAL(res.select(dim, 0), x); + REQUIRE_EQUAL(res.select(dim, 1), y); + REQUIRE_EQUAL(res.select(dim, 2), z); + } + } + + SECTION( "size / stride" ) { + auto scalar = randn({}, T); + REQUIRE_THROWS_WITH(scalar.size(0), "dimension specified as 0 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.size(-1), "dimension specified as -1 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.stride(0), "dimension specified as 0 but tensor has no dimensions"); + REQUIRE_THROWS_WITH(scalar.stride(-1), "dimension specified as -1 but tensor has no dimensions"); + + auto empty = randn({0}, T); + REQUIRE(empty.size(0) == 0); + REQUIRE(empty.size(-1) == 0); + REQUIRE(empty.stride(0) == 1); + REQUIRE(empty.stride(-1) == 1); + } + + // matmul + SECTION( "matmul" ) { + auto scalar = randn({}, T); + auto d1 = randn({3}, T); + auto d2 = randn({2, 3}, T); + + // 0-d + REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at 
least 1D")); + REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + + // 1-d + REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); + REQUIRE_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); + auto d1o = randn({2}, T); + REQUIRE_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); + + // 2-d + auto d2o = randn({3, 5}, T); + REQUIRE_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); + + // > 2-d, 1-d + auto d3 = randn({5, 2, 3}, T); + REQUIRE_ALLCLOSE(d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); + REQUIRE_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); + + auto d5 = randn({3, 2, 4, 2, 3}, T); + REQUIRE_ALLCLOSE(d5.matmul(d1), d5.view({24, 2, 3}).bmm(d1.view({1, 3, 1}).expand({24, 3, 1})).view({3, 2, 4, 2})); + REQUIRE_ALLCLOSE(d1o.matmul(d5), d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); + + // > 2-d, 2-d + // we use a "folding" algorithm in this case of matmul, so the direct comparison to bmm doesn't work; + // instead, compare to the higher precision computation (technically, we should always do this). + // Tolerances are selected empirically. + double atol = 1e-04; + double rtol = 1e-06; + d2 = randn({3, 4}, T); + d2o = randn({4, 2}, T); + auto result = d5.matmul(d2).toType(AccT); + + auto d5Acc = d5.toType(AccT); + auto d2Acc = d2.toType(AccT); + auto acc_result = d5Acc.view({24, 2, 3}).bmm(d2Acc.expand({24, 3, 4})).view({3, 2, 4, 2, 4}); + REQUIRE_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); + REQUIRE_ALLCLOSE(d2o.matmul(d5), d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); + + // > 2-d, > 2-d + auto d5o = randn({2, 1, 2, 4, 3, 2}, T); + auto d5_bmm_view = d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); + auto d5o_bmm_view = d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); + REQUIRE_ALLCLOSE(d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); + + // non-expandable case + auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); + REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + } + + // _standard_gamma_grad + SECTION( "_standard_gamma_grad" ) { + // check empty + auto empty = ones({0}, T); + REQUIRE_EQUAL(empty, empty._standard_gamma_grad(empty)); + + // check scalar equals one element + auto one_scalar = ones({}, T).mul(5); + auto one_with_dim = ones({1}, T).mul(5); + REQUIRE_ALLCLOSE(one_scalar._standard_gamma_grad(one_scalar), + one_with_dim._standard_gamma_grad(one_with_dim).sum()); + + // check mixing types + auto t1 = randn({3, 4}, T); + auto t2 = randn({3, 4}, T).toType(kDouble); + REQUIRE_THROWS_WITH(t1._standard_gamma_grad(t2), Catch::StartsWith("expected scalar type")); + } + + SECTION( "where" ) { + // empty + auto empty = ones({0}, T); + auto &bT = T.toScalarType(ScalarType::Byte); + auto empty_byte = ones({0}, bT); + REQUIRE_EQUAL(empty, at::where(empty_byte, empty, empty)); + + // check scalar equals one element + auto x_scalar = ones({}, T).mul(5); + auto y_scalar = ones({}, T).mul(7); + auto cond_scalar = zeros({}, bT); + auto x_1d = x_scalar.unsqueeze(0); + auto y_1d = y_scalar.unsqueeze(0); + auto cond_1d = cond_scalar.unsqueeze(0); + REQUIRE_ALLCLOSE(at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), + at::where(cond_1d, x_1d, y_1d)); + } +} + +TEST_CASE( "native test CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat), CPU(kDouble)); +} + +TEST_CASE( "native test CUDA", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if 
(at::hasCUDA()) { + test(CUDA(kFloat), CUDA(kDouble)); + } +} diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp new file mode 100644 index 0000000..620e5ec --- /dev/null +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -0,0 +1,286 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include +#include +#include + +using namespace at; + +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::exception &e) { \ + REQUIRE(!_passed); \ + catc; \ + } \ + } + +void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { + REQUIRE(lhs.dim() == rhs.dim()); + REQUIRE(lhs.sizes().equals(rhs.sizes())); +} + +bool should_expand(const IntList &from_size, const IntList &to_size) { + if(from_size.size() > to_size.size()) { + return false; + } + for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); ++from_dim_it) { + for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); ++to_dim_it) { + if (*from_dim_it != 1 && *from_dim_it != *to_dim_it) { + return false; + } + } + } + return true; +} + +void test(Type &T) { + std::vector > sizes = { {}, {0}, {1}, {1, 1}, {2}}; + + // single-tensor/size tests + for (auto s = sizes.begin(); s != sizes.end(); ++s) { + // verify that the dim, sizes, strides, etc match what was requested. + auto t = ones(*s, T); + REQUIRE((size_t)t.dim() == s->size()); + REQUIRE((size_t)t.ndimension() == s->size()); + REQUIRE(t.sizes().equals(*s)); + REQUIRE(t.strides().size() == s->size()); + auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); + REQUIRE(t.numel() == numel); + // verify we can output + std::stringstream ss; + REQUIRE_NOTHROW(ss << t << std::endl); + + // set_ + auto t2 = ones(*s, T); + t2.set_(); + require_equal_size_dim(t2, ones({0}, T)); + + // unsqueeze + if (t.numel() != 0) { + REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t.unsqueeze(0)); + } + + // unsqueeze_ + { + auto t2 = ones(*s, T); + if (t2.numel() != 0) { + auto r = t2.unsqueeze_(0); + REQUIRE(r.dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t2.unsqueeze_(0)); + } + } + + // squeeze (with dimension argument) + if (t.dim() == 0 || t.sizes()[0] == 1) { + REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + } else { + // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; + // in NumPy this is an error. + REQUIRE(t.squeeze(0).dim() == t.dim()); + } + + // squeeze (with no dimension argument) + { + std::vector size_without_ones; + for (auto size : *s) { + if (size != 1) { + size_without_ones.push_back(size); + } + } + auto result = t.squeeze(); + require_equal_size_dim(result, ones(size_without_ones, T)); + } + + { + // squeeze_ (with dimension argument) + auto t2 = ones(*s, T); + if (t2.dim() == 0 || t2.sizes()[0] == 1) { + REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + } else { + // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; + // in NumPy this is an error. 
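+        // so squeeze_(0) must leave the number of dimensions unchanged: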
+ REQUIRE(t2.squeeze_(0).dim() == t.dim()); + } + } + + // squeeze_ (with no dimension argument) + { + auto t2 = ones(*s, T); + std::vector size_without_ones; + for (auto size : *s) { + if (size != 1) { + size_without_ones.push_back(size); + } + } + auto r = t2.squeeze_(); + require_equal_size_dim(t2, ones(size_without_ones, T)); + } + + // reduce (with dimension argument and with 1 return argument) + if (t.numel() != 0) { + REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE(t.sum(0).equal(at::zeros({}, T))); + } + + // reduce (with dimension argument and with 2 return arguments) + if (t.numel() != 0) { + auto ret = t.min(0); + REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE_THROWS(t.min(0)); + } + + // simple indexing + if (t.dim() > 0 && t.numel() != 0) { + REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + } else { + REQUIRE_THROWS(t[0]); + } + + // fill_ (argument to fill_ can only be a 0-dim tensor) + TRY_CATCH_ELSE(t.fill_(t.sum(0)), + REQUIRE(t.dim() > 1), + REQUIRE(t.dim() <= 1)); + } + + for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { + for (auto rhs_it = sizes.begin(); rhs_it != sizes.end(); ++rhs_it) { + // is_same_size should only match if they are the same shape + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + if(*lhs_it != *rhs_it) { + REQUIRE(!lhs.is_same_size(rhs)); + REQUIRE(!rhs.is_same_size(lhs)); + } + } + // forced size functions (resize_, resize_as, set_) + { + // resize_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_(*rhs_it); + require_equal_size_dim(lhs, rhs); + } + // resize_as_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_as_(rhs); + require_equal_size_dim(lhs, rhs); + } + // set_ + { + { + // with tensor + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.set_(rhs); + require_equal_size_dim(lhs, rhs); + } + { + // with storage + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel()); + lhs.set_(*storage); + // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars + REQUIRE(lhs.dim() != 0); + } + { + // with storage, offset, sizes, strides + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel()); + lhs.set_(*storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); + require_equal_size_dim(lhs, rhs); + } + } + } + + // view + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), + REQUIRE(lhs.numel() != rhs.numel()), + REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + } + + // take + { + auto lhs = ones(*lhs_it, T); + auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); + TRY_CATCH_ELSE(auto result = lhs.take(rhs), + REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + require_equal_size_dim(result, rhs)); + } + + + // ger + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + TRY_CATCH_ELSE(auto result = lhs.ger(rhs), + REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + [&]() { + int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); + int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); + require_equal_size_dim(result, result.type().tensor({dim0, dim1})); + }();); + } + + // expand + { + auto lhs = ones(*lhs_it, T); + auto lhs_size = *lhs_it; + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + bool should_pass = should_expand(lhs_size, rhs_size); + TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), + REQUIRE(!should_pass), + REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + + // in-place functions (would be good if we can also do a non-broadcasting one, b/c + // broadcasting functions will always end up operating on tensors of same size; + // is there an example of this outside of assign_ ?) + { + bool should_pass_inplace = should_expand(rhs_size, lhs_size); + TRY_CATCH_ELSE(lhs.add_(rhs), + REQUIRE(!should_pass_inplace), + REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + } + } + } + } +} + +TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { + manual_seed(123, at::Backend::CPU); + + test(CPU(kFloat)); +} + +TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { + manual_seed(123, at::Backend::CUDA); + + if (at::hasCUDA()) { + test(CUDA(kFloat)); + } +} diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp new file mode 100644 index 0000000..ccdab08 --- /dev/null +++ b/aten/src/ATen/test/scalar_test.cpp @@ -0,0 +1,151 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include +// define constants like M_PI and C keywords for MSVC +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "test_seed.h" + +using std::cout; +using namespace at; + +constexpr auto Float = ScalarType::Float; + +template +struct Foo { + static void apply(Tensor a, Tensor b) { + scalar_type s = 1; + std::stringstream ss; + ss << "hello, dispatch: " << a.type().toString() << s << "\n"; + auto data = (scalar_type*)a.data_ptr(); + (void)data; + } +}; +template<> +struct Foo { + static void apply(Tensor a, Tensor b) {} +}; + +void test_ctors() { + // create scalars backed by tensors + auto s1 = Scalar(CPU(kFloat).scalarTensor(1)); + auto s2 = Scalar(CPU(kFloat).scalarTensor(2)); + Scalar{s1}; + Scalar{std::move(s2)}; + REQUIRE(s2.isBackedByTensor()); + REQUIRE(!s2.toTensor().defined()); + s2 = s1; + REQUIRE(s2.isBackedByTensor()); + REQUIRE(s2.toFloat() == 1.0); + Scalar s3; + s3 = std::move(s2); + REQUIRE(s2.isBackedByTensor()); + REQUIRE(!s2.toTensor().defined()); + REQUIRE(s3.isBackedByTensor()); + REQUIRE(s3.toFloat() == 1.0); +} + +void test_overflow() { + auto s1 = Scalar(M_PI); + REQUIRE(s1.toFloat() == static_cast(M_PI)); + s1.toHalf(); + + s1 = Scalar(100000); + REQUIRE(s1.toFloat() == 100000.0); + REQUIRE(s1.toInt() == 100000); + + REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + + s1 = Scalar(NAN); + REQUIRE(std::isnan(s1.toFloat())); + REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + + s1 = Scalar(INFINITY); + REQUIRE(std::isinf(s1.toFloat())); + REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); +} + +TEST_CASE( "scalar test", "[]" ) { + + manual_seed(123, at::Backend::CPU); + manual_seed(123, at::Backend::CUDA); + + Scalar what = 257; + Scalar bar = 3.0; + Half h = bar.toHalf(); + Scalar h2 = h; + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; + Generator & gen = at::globalContext().defaultGenerator(Backend::CPU); + REQUIRE_NOTHROW(gen.seed()); + auto && C = at::globalContext(); + if(at::hasCUDA()) { + auto & CUDAFloat = 
C.getType(Backend::CUDA,ScalarType::Float); + auto t2 = zeros({4,4}, CUDAFloat); + cout << &t2 << "\n"; + cout << "AFTER GET TYPE " << &CUDAFloat << "\n"; + auto s = CUDAFloat.storage(4); + REQUIRE( s->get(3).toFloat() == 0.0 ); + s->fill(7); + REQUIRE( s->get(3).toFloat() == 7.0 ); + } + auto t = ones({4,4}); + + auto wha2 = zeros({4,4}).add(t).sum(); + REQUIRE( wha2.toCDouble() == 16.0 ); + + REQUIRE( t.sizes()[0] == 4 ); + REQUIRE( t.sizes()[1] == 4 ); + REQUIRE( t.strides()[0] == 4 ); + REQUIRE( t.strides()[1] == 1 ); + + Type & T = CPU(Float); + Tensor x = randn({1,10}, T); + Tensor prev_h = randn({1,20}, T); + Tensor W_h = randn({20,20}, T); + Tensor W_x = randn({20,10}, T); + Tensor i2h = at::mm(W_x, x.t()); + Tensor h2h = at::mm(W_h, prev_h.t()); + Tensor next_h = i2h.add(h2h); + next_h = next_h.tanh(); + + REQUIRE_THROWS(Scalar{Tensor{}}); + + test_ctors(); + test_overflow(); + + if(at::hasCUDA()) { + auto r = CUDA(Float).copy(next_h); + REQUIRE(CPU(Float).copy(r).equal(next_h)); + } + REQUIRE_NOTHROW(randn({10,10,2}, T)); + + // check Scalar.toTensor on Scalars backed by different data types + REQUIRE(bar.toTensor().type().scalarType() == kDouble); + REQUIRE(what.toTensor().type().scalarType() == kLong); + REQUIRE(Scalar(ones({})).toTensor().type().scalarType() == kFloat); + + if (x.type().scalarType() != ScalarType::Half) { + AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { + scalar_t s = 1; + std::stringstream ss; + REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + auto data = (scalar_t*)x.data_ptr(); + (void)data; + }); + } + + // test direct C-scalar type conversions + { + auto x = ones({1,2}, T); + REQUIRE_THROWS(x.toCFloat()); + } + auto float_one = ones({}, T); + REQUIRE(float_one.toCFloat() == 1); + REQUIRE(float_one.toCInt() == 1); + REQUIRE((float_one.toCHalf() == 1)); +} diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp new file mode 100644 index 0000000..8946026 --- /dev/null +++ b/aten/src/ATen/test/stream_test.cpp @@ -0,0 +1,103 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" + +#include "cuda_runtime.h" + +#include +#include + +/* +Tests related to ATen streams. 
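+These tests require a CUDA build; they cover copying and moving streams, getting and setting the current (thread-local) stream, and manual retain/free of the underlying stream handle.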
+*/ +TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copying and moving") { + int32_t device = -1; + cudaStream_t cuda_stream; + + // Tests that copying works as expected and preserves the stream + at::CUDAStream copyStream; + { + auto s = at::globalContext().createCUDAStream(); + device = s.device(); + cuda_stream = s.stream(); + + copyStream = s; + + REQUIRE(copyStream.internals() == s.internals()); + REQUIRE(copyStream.device() == device); + REQUIRE(copyStream.stream() == cuda_stream); + } + + REQUIRE(copyStream.internals()); + REQUIRE(copyStream.device() == device); + REQUIRE(copyStream.stream() == cuda_stream); + + // Tests that moving works as expected and preserves the stream + at::CUDAStream moveStream; + { + auto s = at::globalContext().createCUDAStream(); + device = s.device(); + cuda_stream = s.stream(); + + moveStream = std::move(s); + + REQUIRE(moveStream.device() == device); + REQUIRE(moveStream.stream() == cuda_stream); + } + + REQUIRE(moveStream.internals()); + REQUIRE(moveStream.device() == device); + REQUIRE(moveStream.stream() == cuda_stream); +} + +TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { + at::CUDAStream myStream = at::globalContext().createCUDAStream(); + + // Sets and gets + at::globalContext().setCurrentCUDAStream(myStream); + at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); + + REQUIRE(myStream == curStream); + + // Gets, sets, and gets default stream + at::CUDAStream defaultStream = at::globalContext().getDefaultCUDAStream(); + at::globalContext().setCurrentCUDAStream(defaultStream); + curStream = at::globalContext().getCurrentCUDAStream(); + + REQUIRE(defaultStream != myStream); + REQUIRE(curStream == defaultStream); +} + +TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { + auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( + at::CUDAStream::DEFAULT_FLAGS + , at::CUDAStream::DEFAULT_PRIORITY); + + at::detail::CUDAStream_free(ptr); + REQUIRE(ptr == nullptr); +} + +void thread_fun(at::CUDAStream& cur_thread_stream) { + auto new_stream = at::globalContext().createCUDAStream(); + at::globalContext().setCurrentCUDAStream(new_stream); + cur_thread_stream = at::globalContext().getCurrentCUDAStream(); + REQUIRE(cur_thread_stream == new_stream); +} + +TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") { + at::CUDAStream s0, s1; + + std::thread t0{thread_fun, std::ref(s0)}; + std::thread t1{thread_fun, std::ref(s1)}; + t0.join(); + t1.join(); + + at::CUDAStream cur_stream = at::globalContext().getCurrentCUDAStream(); + at::CUDAStream default_stream = at::globalContext().getDefaultCUDAStream(); + + REQUIRE(cur_stream == default_stream); + REQUIRE(cur_stream != s0); + REQUIRE(cur_stream != s1); + REQUIRE(s0 != s1); +} diff --git a/aten/src/ATen/test/tbb_init_test.cpp b/aten/src/ATen/test/tbb_init_test.cpp new file mode 100644 index 0000000..027b878 --- /dev/null +++ b/aten/src/ATen/test/tbb_init_test.cpp @@ -0,0 +1,43 @@ +#include "ATen/ATen.h" +#include "ATen/Parallel.h" +#include "test_assert.h" +#include "test_seed.h" +#include + +using namespace at; + +// This checks whether threads can see the global +// numbers of threads set and also whether the scheduler +// will throw an exception when multiple threads call +// their first parallel construct. 
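+// In this test, a negative given_num_threads means set_num_threads() has not been called yet, +// so get_num_threads() is expected to still report -1.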
+void test(int given_num_threads) { + auto t = ones({1000 * 1000}, CPU(kFloat)); + if (given_num_threads >= 0) { + ASSERT(at::get_num_threads() == given_num_threads); + } else { + ASSERT(at::get_num_threads() == -1); + } + auto t_sum = t.sum(); + for (int i = 0; i < 1000; i ++) { + t_sum = t_sum + t.sum(); + } +} + +int main() { + manual_seed(123, at::Backend::CPU); + + test(-1); + std::thread t1(test, -1); + t1.join(); + at::set_num_threads(4); + std::thread t2(test, 4); + std::thread t3(test, 4); + std::thread t4(test, 4); + t4.join(); + t3.join(); + t2.join(); + at::set_num_threads(5); + test(5); + + return 0; +} diff --git a/aten/src/ATen/test/test_assert.h b/aten/src/ATen/test/test_assert.h new file mode 100644 index 0000000..8b01172 --- /dev/null +++ b/aten/src/ATen/test/test_assert.h @@ -0,0 +1,67 @@ +#pragma once +#include <stdarg.h> +#include <stdexcept> + +static inline void barf(const char *fmt, ...) { + char msg[2048]; + va_list args; + va_start(args, fmt); + vsnprintf(msg, 2048, fmt, args); + va_end(args); + throw std::runtime_error(msg); +} + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define AT_EXPECT(x, y) (__builtin_expect((x),(y))) +#else +#define AT_EXPECT(x, y) (x) +#endif + +#define ASSERT(cond) \ + if (AT_EXPECT(!(cond), 0)) { \ + barf("%s:%u: %s: Assertion `%s` failed.", __FILE__, __LINE__, __func__, #cond); \ + } + +//note: msg must be a string literal +//note: in ", ##__VA_ARGS__" the '##' suppresses the comma if __VA_ARGS__ is empty +#define ASSERTM(cond, msg, ...) \ + if (AT_EXPECT(!(cond), 0)) { \ + barf("%s:%u: %s: Assertion `%s` failed: " msg , __FILE__, __LINE__, __func__, #cond,##__VA_ARGS__); \ + } + +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::runtime_error &e) { \ + ASSERT(!_passed); \ + catc; \ + } \ + } + +#define ASSERT_THROWSM(fn, message) \ + TRY_CATCH_ELSE(fn, ASSERT(std::string(e.what()).find(message) != std::string::npos), ASSERT(false)) + +#define ASSERT_THROWS(fn) \ + ASSERT_THROWSM(fn, ""); + +#define ASSERT_EQUAL(t1, t2) \ + ASSERT(t1.equal(t2)); + +// allclose broadcasts, so check same size before allclose. +#define ASSERT_ALLCLOSE(t1, t2) \ + ASSERT(t1.is_same_size(t2)); \ + ASSERT(t1.allclose(t2)); + +// allclose broadcasts, so check same size before allclose.
+#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT(t1.is_same_size(t2)); \ + ASSERT(t1.allclose(t2, atol, rtol)); diff --git a/aten/src/ATen/test/test_install/CMakeLists.txt b/aten/src/ATen/test/test_install/CMakeLists.txt new file mode 100644 index 0000000..dc904b4 --- /dev/null +++ b/aten/src/ATen/test/test_install/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.0) +find_package(ATen REQUIRED) +include_directories(${ATEN_INCLUDE_DIR}) + +# C++11 +set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") +add_executable(main main.cpp) +target_link_libraries(main ${ATEN_LIBRARIES}) diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp new file mode 100644 index 0000000..adeae38 --- /dev/null +++ b/aten/src/ATen/test/test_install/main.cpp @@ -0,0 +1,5 @@ +#include "ATen/ATen.h" + +int main() { + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; +} diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp new file mode 100644 index 0000000..5dbd967 --- /dev/null +++ b/aten/src/ATen/test/test_parallel.cpp @@ -0,0 +1,28 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/DLConvertor.h" + +#include +#include +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "parallel", "[cpu]" ) { + + manual_seed(123, at::Backend::CPU); + set_num_threads(1); + + Tensor a = rand({1,3}); + a[0][0] = 1; + a[0][1] = 0; + a[0][2] = 0; + Tensor as = rand({3}); + as[0] = 1; + as[1] = 0; + as[2] = 0; + REQUIRE(a.sum(0).equal(as)); +} diff --git a/aten/src/ATen/test/test_seed.h b/aten/src/ATen/test/test_seed.h new file mode 100644 index 0000000..16f9ecb --- /dev/null +++ b/aten/src/ATen/test/test_seed.h @@ -0,0 +1,13 @@ +#pragma once + +#include "ATen/ATen.h" + +void manual_seed(uint64_t seed, at::Backend backend) { + if (backend == at::Backend::CPU) { + at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::Backend::CPU); + cpu_gen.manualSeed(seed); + } else if (backend == at::Backend::CUDA && at::hasCUDA()) { + at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::Backend::CUDA); + cuda_gen.manualSeed(seed); + } +} diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp new file mode 100644 index 0000000..d88923d --- /dev/null +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -0,0 +1,53 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "ATen/UndefinedTensor.h" +#include +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "undefined tensor test", "[]" ) { + manual_seed(123, at::Backend::CPU); + + // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. 
+ Tensor und; + Tensor ft = ones({1}, CPU(kFloat)); + + std::stringstream ss; + ss << und << std::endl; + REQUIRE(!und.defined()); + REQUIRE(std::string("UndefinedTensor") == und.toString()); + + REQUIRE_THROWS_WITH(und.strides(), Catch::Contains("strides")); + REQUIRE_THROWS_WITH(und.dim(), Catch::Contains("dim")); + REQUIRE_THROWS_WITH([]() {return Tensor();}() = Scalar(5), Catch::Contains("UndefinedType")); + REQUIRE_THROWS_WITH(und.unsafeGetTH(true), Catch::Contains("unsafeGetTH")); + REQUIRE_THROWS_WITH(und.add(und), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.add(ft), Catch::Contains("add")); + REQUIRE_THROWS_WITH(ft.add(und), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.add(5), Catch::Contains("add")); + REQUIRE_THROWS_WITH(und.mm(und), Catch::Contains("mm")); + + und.toType(und.type()); + REQUIRE_THROWS_WITH(und.toType(ft.type()), Catch::Contains("attempt to copy an undefined tensor")); + REQUIRE_THROWS_WITH(ft.toType(und.type()), Catch::Contains("UndefinedType")); + und.toType(ScalarType::Undefined); + REQUIRE_THROWS_WITH(und.toType(ScalarType::Float), Catch::Contains("toScalarType")); + REQUIRE_THROWS_WITH(ft.toType(ScalarType::Undefined), Catch::Contains("UndefinedType")); + + // copy_ + REQUIRE_THROWS_WITH(und.copy_(und), Catch::Contains("copy")); + REQUIRE_THROWS_WITH(und.copy_(ft), Catch::Contains("copy")); + REQUIRE_THROWS_WITH(ft.copy_(und), Catch::Contains("copy")); + + und.toBackend(Backend::Undefined); + REQUIRE_THROWS_WITH(und.toBackend(Backend::CPU), Catch::Contains("toBackend")); + REQUIRE_THROWS_WITH(ft.toBackend(Backend::Undefined), Catch::Contains("UndefinedType")); + + Tensor to_move = ones({1}, CPU(kFloat)); + Tensor m(std::move(to_move)); + REQUIRE(!to_move.defined()); + REQUIRE(to_move.get() == UndefinedTensor::singleton()); +} diff --git a/aten/src/ATen/test/verify_api_visibility.cpp b/aten/src/ATen/test/verify_api_visibility.cpp new file mode 100644 index 0000000..ed296ce --- /dev/null +++ b/aten/src/ATen/test/verify_api_visibility.cpp @@ -0,0 +1,15 @@ +#include + +#ifdef AT_CUDNN_ENABLED +#error "AT_CUDNN_ENABLED should not be visible in public headers" +#endif + +#ifdef AT_MKL_ENABLED +#error "AT_MKL_ENABLED should not be visible in public headers" +#endif + +#ifdef AT_MKLDNN_ENABLED +#error "AT_MKLDNN_ENABLED should not be visible in public headers" +#endif + +auto main() -> int {} diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp new file mode 100644 index 0000000..aab2ec5 --- /dev/null +++ b/aten/src/ATen/test/weakref_test.cpp @@ -0,0 +1,64 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" + +#include +#include +#include + +using at::Tensor; +using at::WeakTensor; + +TEST_CASE( "Weak pointer tests", "" ) { + SECTION("gets invalidated") { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + a.reset(); + REQUIRE_FALSE(b.lock().defined()); + } + + SECTION("can successfully lock") { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + auto c = b.lock(); + REQUIRE(c.defined()); + + a.reset(); + REQUIRE(b.lock().defined()); + c.reset(); + REQUIRE_FALSE(b.lock().defined()); + } + + SECTION("updates refcounts correctly") { + Tensor a = at::ones({2, 2}); + auto ai = a.unsafeGetTensorImpl(); + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 2); + } + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + 
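// Locking the weak reference yields a strong Tensor, so use_count rises to 2 while `locked` is in scope. +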
auto locked = b.lock(); + REQUIRE(locked.defined()); + REQUIRE(ai->use_count() == 2); + } + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 1); + { + WeakTensor b = a; + REQUIRE(ai->use_count() == 1); + REQUIRE(ai->weak_use_count() == 2); + a.reset(); + auto bi = b.unsafeGetTensorImpl(); + REQUIRE(bi->use_count() == 0); + REQUIRE(bi->weak_use_count() == 1); + } + } +} diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp new file mode 100644 index 0000000..599c103 --- /dev/null +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -0,0 +1,43 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" + +using namespace at; + +TEST_CASE( "wrapdim test", "[]" ) { + manual_seed(123, at::Backend::CPU); + + Type & T = CPU(kFloat); + + SECTION( "simple case" ) { + auto a = randn({2, 3, 4, 5}, T); + REQUIRE(a.prod(-4).equal(a.prod(0))); + REQUIRE(a.prod(3).equal(a.prod(-1))); + } + + SECTION( "expression specification" ) { + auto a = randn({2, 3, 4, 5}, T); + REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + + // can unsqueeze scalar + auto b = randn(1, T); + b.get()->maybeScalar(true); + REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + } + + SECTION( "empty tensor" ) { + auto a = randn(0, T); + REQUIRE(a.prod(0).equal(at::ones({}, T))); + } + + SECTION( "scalar vs 1-dim, 1-size" ) { + auto a = randn(1, T); + REQUIRE(a.prod(0).equal(a.prod(-1))); + a.get()->maybeScalar(true); + REQUIRE(a.get()->isScalar()); + REQUIRE(a.prod(0).equal(a.prod(-1))); + } +} diff --git a/aten/src/README.md b/aten/src/README.md new file mode 100644 index 0000000..a641ea1 --- /dev/null +++ b/aten/src/README.md @@ -0,0 +1,144 @@ +This directory contains the low-level tensor libraries for PyTorch, +as well as the new ATen C++ bindings. + +The low-level libraries trace their lineage from the original Torch. There are +multiple variants of the library, summarized here: + +* TH = TorcH +* THC = TorcH Cuda +* THCS = TorcH Cuda Sparse (now defunct) +* THCUNN = TorcH CUda Neural Network (see cunn) +* THD = TorcH Distributed +* THNN = TorcH Neural Network +* THS = TorcH Sparse (now defunct) + +(You'll also see these abbreviations show up in symbol names.) + +## Reference counting + +PyTorch employs reference counting in order to permit tensors to provide +differing views on a common underlying storage. For example, when you call +view() on a Tensor, a new THTensor is allocated with differing dimensions, +but it shares the same THStorage with the original tensor. + +Unfortunately, this means we are in the business of manually tracking reference +counts inside our C library code. Fortunately, for most of our library code implementing +tensor operations, there is only one rule you have to remember: + +> **Golden Rule of Reference Counting:** You must either FREE or RETURN +> a pointer which was returned by a function whose name begins with +> `new` or which you called `retain` on. +> If you return this pointer, your function name must begin with `new`. + +In a long function, there may be many invocations of functions with `new` in +their name. Your responsibility is to go through each of them and ensure +that there is a matching `free` for it for EACH exit point of the function. + +### Examples + +Suppose you want to get a reference to the indices of a sparse tensor. This +function is called `newIndices`. The `new` means you MUST free it when you're +done (usually at the end of your function.) 
(It's worth noting that +`newIndices` doesn't actually allocate a fresh indices tensor; it just gives +you a pointer to the existing one.) DO NOT directly access the member +variables of the struct. + +``` +THIndexTensor *indices = THSTensor_(newIndices)(state, sparse); +// ... do some stuff ... +THIndexTensor_(free)(state, indices); +``` + +Let's take a look at the implementation of `newIndices`. This doesn't free the +return result of `newNarrow`, but returns it. This justifies the `new` in its +name. + +``` +THIndexTensor *THSTensor_(newIndices)(const THSTensor *self) { + // ... + return THIndexTensor_(newNarrow)(self->indices, 1, 0, self->nnz); +} +``` + +Passing an object to another function does NOT absolve you of responsibility +of freeing it. If that function holds on to a pointer to the object, it +will `retain` it itself. + +``` + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL); + THLongStorage_free(inferred_size); +``` + +Sometimes, you have a tensor in hand which you'd like to use directly, but +under some conditions you have to call, e.g., `newContiguous`, to get +it into the correct form: + +``` + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + ... + THTensor_(free)(kernel); +``` + +In this case, we have (redundantly) called `retain` on `k_`, so that we can +unconditionally free `kernel` at the end of the function; intuitively, you +want it to be possible to replace the conditional expression with an equivalent +function call, e.g., `kernel = THTensor_(newContiguous2D)(k_)`. + +### Tips + +* If you have an early exit in a function (via a `return`), don't forget to + `free` any pointers which you allocated up to this point. If at all possible, + move early exits prior to these allocations, so that you don't have to clean up. + +* Very occasionally, you may be able to implement an algorithm more efficiently + if you "destroy" its input. This is a `move`; after moving an object away, + you must NOT `free` it. This is the one exception to the rule, and at the + moment there is only one instance of `move` in the code base. + +* We use `THError` to signal error cases, and fortunately, + you do NOT need to make sure you've freed everything before calling `THError`, + because by default, it aborts the entire process. However, it's good style + to call `THError` before performing any allocations, since in some cases we + sketchily throw a C++ exception and try to recover (in particular, the test + suite does this.) + +## The C interface + +Historically, the Torch libraries were implemented in C. Since then, we have slowly +started rewriting bits and pieces of Torch in C++ (usually because there is some +C++ feature which would be really helpful for writing something.) However, +Torch has *always been*, and *will always be* a library that provides a C ABI +interface, even if, at some point in the future, its internal implementation +is entirely done in a C++ library that heavily uses C++ idioms. (At the moment, +all of the source files are C++, but they are mostly C code that happens to be +compiled as C++). + +In order to achieve this, the `TH_API` macro (called `THC_API` in `THC`) plays +a crucial role: it declares a function as having C-linkage, which means that the +C++ compiler doesn't mangle its name and a C client can link against it.
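+For illustration only, an export macro of this kind is conventionally built along the following lines; the exact definition used in the tree (see `THGeneral.h`) differs and additionally handles per-platform symbol visibility, and the helper names below (`TH_EXTERNC`, `TH_EXPORTS`) are placeholders for this sketch: + +``` +#ifdef __cplusplus +# define TH_EXTERNC extern "C" +#else +# define TH_EXTERNC extern +#endif + +#ifdef _WIN32 +# ifdef TH_EXPORTS +#  define TH_API TH_EXTERNC __declspec(dllexport) +# else +#  define TH_API TH_EXTERNC __declspec(dllimport) +# endif +#else +# define TH_API TH_EXTERNC +#endif +``` + +The essential piece is the `extern "C"`: it prevents C++ name mangling, so a plain C (or any FFI) client can resolve the symbol by its declared name.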
+ +As a developer, here is what you need to know: + +1. If you add a function to the public API of Torch, you *must* mark it with + `TH_API` or `THC_API` (depending if you are in CPU or CUDA land). + This will ensure it is built with C-linkage (and on Windows, it + will also ensure that the symbol is exported from the DLL; otherwise it + won't be visible.) + +2. C++ features should ONLY be used in `.cpp` and `.hpp` files, and not in + `.h` files. If you need to use a C++ type in a header file, you should + define this in a separate, C++ only header `.hpp`, and declare it opaquely + in the `.h`. Search for `mutex` for an example of this principle being applied. + (This convention is OPPOSITE from the prevailing convention in PyTorch and + ATen, where C++ headers are defined in `.h` files.) + +Arguably, the "C-compatible" headers should live in a separate directory, +distinct from the C++ code. We think this might be a good thing to do +eventually, and would make the code structure more clear, but we have not +done it at the moment. diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt new file mode 100644 index 0000000..5d588df --- /dev/null +++ b/aten/src/TH/CMakeLists.txt @@ -0,0 +1,131 @@ +set(extra_src) + +# IF ANY SIMD FOUND +IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve.cpp) +ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + +# IF SSE4 FOUND +IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_sse.cpp) +ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) + +# IF AVX FOUND +IF(C_AVX_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX.cpp) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_avx.cpp) +ENDIF(C_AVX_FOUND) + +IF(C_AVX2_FOUND) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX2.cpp) +ENDIF(C_AVX2_FOUND) + +SET(hdr + THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THLapack.h THLogAdd.h THRandom.h THVector.h ) + +set(ATen_TH_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THLapack.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THLogAdd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THRandom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THDiskFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THMemoryFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THVector.cpp + ${extra_src} + ) +# Remember that PARENT_SCOPE variables are not in the current scope +set(ATen_TH_SRCS ${ATen_TH_SRCS} PARENT_SCOPE) +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} ${ATen_TH_SRCS} PARENT_SCOPE) +###################################################### + + +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +PARENT_SCOPE) + +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +PARENT_SCOPE) + +CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h") + + +INSTALL(FILES + TH.h + THAllocator.h + THMath.h + THBlas.h + THDiskFile.h + THFile.h + THFilePrivate.h + ${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h + THGenerateAllTypes.h + THGenerateDoubleType.h + 
THGenerateFloatType.h + THGenerateHalfType.h + THGenerateLongType.h + THGenerateIntType.h + THGenerateShortType.h + THGenerateCharType.h + THGenerateByteType.h + THGenerateFloatTypes.h + THGenerateIntTypes.h + THLapack.h + THLogAdd.h + THMemoryFile.h + THRandom.h + THSize.h + THStorage.h + THTensor.h + THTensorApply.h + THTensorDimApply.h + THVector.h + THHalf.h + THTensor.hpp + THStorage.hpp + THGenerator.hpp + THTypeConversion.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") + +INSTALL(FILES + vector/AVX.h + vector/AVX2.h + ../ATen/native/cpu/avx_mathfun.h + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/vector") + +INSTALL(FILES + generic/THBlas.cpp + generic/THBlas.h + generic/THLapack.cpp + generic/THLapack.h + generic/THStorage.cpp + generic/THStorage.h + generic/THStorageCopy.cpp + generic/THStorageCopy.h + generic/THTensor.cpp + generic/THTensor.h + generic/THTensorConv.cpp + generic/THTensorConv.h + generic/THTensorCopy.cpp + generic/THTensorCopy.h + generic/THTensorLapack.cpp + generic/THTensorLapack.h + generic/THTensorMath.cpp + generic/THTensorMath.h + generic/THTensorRandom.cpp + generic/THTensorRandom.h + generic/THVectorDispatch.cpp + generic/THVector.h + # See Note [TH abstraction violation] + generic/THTensorFastGetSet.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/generic") diff --git a/aten/src/TH/README.md b/aten/src/TH/README.md new file mode 100644 index 0000000..4ac26c1 --- /dev/null +++ b/aten/src/TH/README.md @@ -0,0 +1,11 @@ +Environment variables control the disabling of certain explicit SIMD optimizations. + +``` +x64 options: +TH_NO_AVX2=1 # disable AVX2 codepaths +TH_NO_AVX=1 # disable AVX codepaths +TH_NO_SSE=1 # disable SSE codepaths + +ppc64le options: +TH_NO_VSX=1 # disable VSX codepaths +``` diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h new file mode 100644 index 0000000..08bdde8 --- /dev/null +++ b/aten/src/TH/TH.h @@ -0,0 +1,24 @@ +#ifndef TH_INC +#define TH_INC + +#include "THGeneral.h" + +#include "THBlas.h" +#ifdef USE_LAPACK +#include "THLapack.h" +#endif + +#include "THVector.h" +#include "THLogAdd.h" +#include "THRandom.h" +#include "THSize.h" +#include "THStorage.h" +#include "THTensor.h" +#include "THTensorApply.h" +#include "THTensorDimApply.h" + +#include "THFile.h" +#include "THDiskFile.h" +#include "THMemoryFile.h" + +#endif diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp new file mode 100644 index 0000000..9dccbb3 --- /dev/null +++ b/aten/src/TH/THAllocator.cpp @@ -0,0 +1,563 @@ +#include "THAllocator.h" + +/* stuff for mapped files */ +#ifdef _WIN32 +#include +#endif + +#include +#if ATOMIC_INT_LOCK_FREE == 2 +#define TH_ATOMIC_IPC_REFCOUNT 1 +#endif + +#if HAVE_MMAP +#include +#include +#include +#include +#include +#endif +/* end of stuff for mapped files */ + +struct THDefaultAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + auto* ptr = THAlloc(size); + return {ptr, ptr, &THFree, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THFree; + } +}; + +static THDefaultAllocator th_default_allocator; +at::Allocator* getTHDefaultAllocator() { + return &th_default_allocator; +} + +#if defined(_WIN32) || defined(HAVE_MMAP) + +#define TH_ALLOC_ALIGNMENT 64 + +typedef struct { + std::atomic refcount; +} THMapInfo; + +const char * unknown_filename = "filename not specified"; +#ifdef _WIN32 +const char * unknown_eventname = "eventname not specified"; +#endif + +THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, 
size_t size) + : filename_(filename ? filename : unknown_filename) + , flags_(0) // to be filled later + , size_(0) // to be filled later +#ifdef _WIN32 + , handle_(INVALID_HANDLE_VALUE) // to be filled later + , event_(INVALID_HANDLE_VALUE) // to be filled later + , eventname_(filename ? std::string(filename) + "_event" : unknown_eventname) +#else + , fd_(fd) +#endif + , base_ptr_(nullptr) +{ + + if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE; + } + if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0) { + AT_ERROR("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file in shared mode"); + } +#ifdef _WIN32 + if (fd != -1) { + AT_ERROR("THMapAllocator_newWithFd is unsupported on Windows"); + } +#endif + flags_ = flags; + + // OK, now do the allocation + + if (size == 0) { + return; + } + +#ifdef _WIN32 + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { + // Shadowing + const char *filename; + const char *eventname; + LARGE_INTEGER hfilesz; + + if (filename_[0] == '/') { + filename = filename_.c_str() + 1; + eventname = eventname_.c_str() + 1; + } else { + filename = filename_.c_str(); + eventname = eventname_.c_str(); + } + + hfilesz.QuadPart = size; + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + event_ = CreateEvent(nullptr, FALSE, FALSE, eventname); + } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + event_ = OpenEvent(EVENT_ALL_ACCESS, FALSE, eventname); + } else { + AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); + } + + if (event_ == nullptr) { + AT_ERROR("Couldn't open shared event: <", eventname, ">, error code: <", GetLastError(), ">"); + } + + if (handle_ == nullptr) { + AT_ERROR("Couldn't open shared file mapping: <", filename, ">, error code: <", GetLastError(), ">"); + } + + size_ = size; + base_ptr_ = MapViewOfFile(handle_, FILE_MAP_ALL_ACCESS, 0, 0, size); + if (!base_ptr_) { + AT_ERROR("Couldn't map view of shared file <", filename, ">, error code: <", GetLastError(), ">"); + } + } else { + + HANDLE hfile; + HANDLE hmfile; + LARGE_INTEGER hfilesz; + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + AT_ERROR("exclusive file mapping is not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + AT_ERROR("file mapping without creation is not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + AT_ERROR("TH_ALLOCATOR_MAPPED_KEEPFD not supported on Windows"); + } + if (flags_ & TH_ALLOCATOR_MAPPED_FROMFD) { + AT_ERROR("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows"); + } + + /* open file */ + /* FILE_FLAG_RANDOM_ACCESS ? 
*/ + if (flags_) { + hfile = CreateFileA(filename_.c_str(), GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); + if (hfile == INVALID_HANDLE_VALUE) { + AT_ERROR("could not open file <", filename_, "> in read-write mode; error code: <", GetLastError(), ">"); + } + } else { + hfile = CreateFileA(filename_.c_str(), GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (hfile == INVALID_HANDLE_VALUE) { + AT_ERROR("could not open file <", filename_, "> in read-only mode; error code: <", GetLastError(), ">"); + } + } + + if (GetFileSizeEx(hfile, &hfilesz) == 0) { + AT_ERROR("could not get file size: <", filename_, ">; error code: <", GetLastError(), ">"); + } + + if (size > 0) { + if (size > hfilesz.QuadPart) { + if (flags_) { + hfilesz.QuadPart = size; + if (SetFilePointerEx(hfile, hfilesz, NULL, FILE_BEGIN) == 0) { + CloseHandle(hfile); + AT_ERROR("unable to stretch file <", filename_, "> to the right size; error code: <", GetLastError(), ">", filename_); + } + if (SetEndOfFile(hfile) == 0) { + CloseHandle(hfile); + AT_ERROR("unable to write to file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } else { + CloseHandle(hfile); + AT_ERROR("file <", filename_, "> size is smaller than the required mapping size <", size, ">; error code: <", GetLastError(), ">"); + } + } + } else { + size = hfilesz.QuadPart; + } + + size_ = size; /* if we are here, it must be the right size */ + + hfilesz.QuadPart = size_; + + /* get map handle */ + if (flags_) { + if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } else { + if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_WRITECOPY, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); + } + } + + /* map the stuff */ + if(flags_) { + base_ptr_ = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0); + } else { + base_ptr_ = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0); + } + + CloseHandle(hfile); + CloseHandle(hmfile); + } +#else /* _WIN32 */ + { + /* open file */ + int fd; + int flags; // shadow + struct stat file_stat; + + if (flags_ & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + flags = O_RDWR | O_CREAT; + } else { + flags = O_RDONLY; + } + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + flags |= O_EXCL; + } + if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + flags &= ~O_CREAT; + } + + if (!(flags_ & TH_ALLOCATOR_MAPPED_FROMFD)) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHARED) { + if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) { + AT_ERROR("unable to open file <", filename_, "> in read-write mode"); + } + } else if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_OPEN + if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) { + AT_ERROR("unable to open shared memory object <", filename_, "> in read-write mode"); + } +#else + AT_ERROR("unable to open file <", filename_, "> in sharedmem mode, shm_open unavailable on this platform"); +#endif + } else { + if ((fd = open(filename_.c_str(), O_RDONLY)) == -1) { + AT_ERROR("unable to open file <", filename_, "> in read-only mode"); + } + } + } else { + fd = fd_; + } + + if (fstat(fd, &file_stat) == -1) { + if (!(flags_ & TH_ALLOCATOR_MAPPED_FROMFD)) { + ::close(fd); + } + 
AT_ERROR("unable to stat the file <", filename_, ">"); + } + + if (size > 0) { + if (size > file_stat.st_size) { + if (flags_) { + if (ftruncate(fd, size) == -1) { + AT_ERROR("unable to resize file <", filename_, "> to the right size"); + } + if (fstat(fd, &file_stat) == -1 || file_stat.st_size < size) { + ::close(fd); + AT_ERROR("unable to stretch file <", filename_, "> to the right size"); + } +/* on macOS write returns with errno 45 (Opperation not supported) when used + * with a file descriptor obtained via shm_open + */ +#ifndef __APPLE__ + if ((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */ { + ::close(fd); + AT_ERROR("unable to write to file <", filename_, ">"); + } +#endif + } else { + ::close(fd); + AT_ERROR("file <", filename_, "> size is smaller than the required mapping size <", size, ">"); + } + } + } else { + size = file_stat.st_size; + } + + size_ = size; /* if we are here, it must be the right size */ + + /* map it */ + if (flags_ & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + base_ptr_ = mmap(nullptr, size_, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + } else { + base_ptr_ = mmap(nullptr, size_, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + if (base_ptr_ == MAP_FAILED) { + base_ptr_ = nullptr; /* let's be sure it is NULL */ + } + + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + fd_ = fd; + } else { + if (::close(fd) == -1) { + AT_ERROR("Error closing file <", filename_, ">"); + } + fd_ = -1; + } + + if (flags_ & TH_ALLOCATOR_MAPPED_UNLINK) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif + } else { + if (unlink(filename_.c_str()) == -1) + AT_ERROR("could not unlink file %s", filename_); + } + } + + if (base_ptr_ == MAP_FAILED) { + AT_ERROR("$ Torch: unable to mmap memory: you tried to mmap ", size_/1073741824, " GB."); + } + } +#endif +} + +THMapAllocator::THMapAllocator(const char *filename, int flags, size_t size) + : THMapAllocator(WITH_FD, filename, -1, flags, size) +{} + +#ifdef _WIN32 +typedef struct{ + HANDLE event; + HANDLE handle; + HANDLE wait; +} ReleaseContext; +static VOID CALLBACK WaitForReleaseHandle(PVOID lpParam, BOOLEAN TimerOrWaitFired) +{ + if (lpParam) { + ReleaseContext *ctx = (ReleaseContext *)lpParam; + + SetEvent(ctx->event); + CloseHandle(ctx->event); + CloseHandle(ctx->handle); + + UnregisterWait(ctx->wait); + + THFree(ctx); + } +} +#endif + +void THMapAllocator::close() { + if (closed_) { + return; + } + closed_ = true; + if (base_ptr_ == nullptr) { + return; + } +#ifdef _WIN32 + if ((flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) || (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM)) + CloseHandle(handle_); + if(UnmapViewOfFile(base_ptr_) == 0) + AT_ERROR("could not unmap the shared memory file"); +#else /* _WIN32 */ + if (flags_ & TH_ALLOCATOR_MAPPED_KEEPFD) { + if (::close(fd_) == -1) { + AT_ERROR("could not close file descriptor ", fd_); + } + } + + if (munmap(base_ptr_, size_)) { + AT_ERROR("could not unmap the shared memory file"); + } + + if (!(flags_ & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK))) { + if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the 
shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif + } + } +#endif /* _WIN32 */ +} + +#else /* defined(_WIN32) || defined(HAVE_MMAP) */ + +THMapAllocator::THMapAllocator(const char *filename, int flags, size_t size) { + AT_ERROR("file mapping not supported on your system"); +} + +THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags) { + AT_ERROR("file mapping not supported on your system"); +} + +THMapAllocator::~THMapAllocator(THMapAllocator* ctx) {} + +#endif + +#if (defined(_WIN32) || defined(HAVE_MMAP)) && defined(TH_ATOMIC_IPC_REFCOUNT) + +THRefcountedMapAllocatorArgCheck::THRefcountedMapAllocatorArgCheck(int flags) { + if (flags & TH_ALLOCATOR_MAPPED_FROMFD) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_FROMFD flag"); + } + if (flags & TH_ALLOCATOR_MAPPED_KEEPFD) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_KEEPFD flag"); + } + if (flags & TH_ALLOCATOR_MAPPED_UNLINK) { + AT_ERROR("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_UNLINK flag"); + } + if (!(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) { + AT_ERROR("THRefcountedMapAllocator requires TH_ALLOCATOR_MAPPED_SHAREDMEM flag"); + } +} + +THRefcountedMapAllocator::THRefcountedMapAllocator(const char *filename, int flags, size_t size) + : THRefcountedMapAllocatorArgCheck(flags) + , THMapAllocator(filename, flags, size + TH_ALLOC_ALIGNMENT) { + + initializeAlloc(); +} +THRefcountedMapAllocator::THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) + : THRefcountedMapAllocatorArgCheck(flags) + , THMapAllocator(WITH_FD, filename, flags, fd, size + TH_ALLOC_ALIGNMENT) { + + initializeAlloc(); +} + +void THRefcountedMapAllocator::initializeAlloc() { + char *data = ((char*)base_ptr_) + TH_ALLOC_ALIGNMENT; + THMapInfo *map_info = (THMapInfo*)base_ptr_; + +#ifdef _WIN32 + ReleaseContext* r_ctx = (ReleaseContext *) THAlloc(sizeof(ReleaseContext)); + r_ctx->handle = handle_; + r_ctx->event = event_; + r_ctx->wait = NULL; + BOOL can_wait = RegisterWaitForSingleObject(&r_ctx->wait, event_, WaitForReleaseHandle, (PVOID)r_ctx, INFINITE, WT_EXECUTEONLYONCE); + if (!can_wait) { + AT_ERROR("Couldn't register wait on event, error code: <", GetLastError(), ">"); + } +#endif + + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + new (&map_info->refcount) std::atomic(1); + } else { + map_info->refcount++; + } +} + +void THRefcountedMapAllocator::close() { + if (closed_) { + return; + } + closed_ = true; + + void* data = base_ptr_; + +#ifdef _WIN32 + THMapInfo *info = (THMapInfo*)data; + if (--info->refcount == 0) { + SetEvent(event_); + } + if(UnmapViewOfFile(data) == 0) { + AT_ERROR("could not unmap the shared memory file"); + } +#else /* _WIN32 */ + + THMapInfo *info = (THMapInfo*)(data); + if (--info->refcount == 0) { +#ifdef HAVE_SHM_UNLINK + if (shm_unlink(filename_.c_str()) == -1) { + AT_ERROR("could not unlink the shared memory file ", filename_); + } +#else + AT_ERROR("could not unlink the shared memory file ", filename_, ", shm_unlink not available on platform"); +#endif /* HAVE_SHM_UNLINK */ + } + if (munmap(info, size_)) { + AT_ERROR("could not unmap the shared memory file ", filename_); + } +#endif /* _WIN32 */ +} + +void THRefcountedMapAllocator::incref() +{ + THMapInfo *map_info = static_cast(base_ptr_); + ++map_info->refcount; +} + +int THRefcountedMapAllocator::decref() +{ + THMapInfo *map_info = static_cast(base_ptr_); + return --map_info->refcount == 0; +} + +#else + + 
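+// Fallback stubs for platforms without file mapping support or lock-free atomic refcounting; +// constructing a THRefcountedMapAllocator here simply raises an error.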
+THRefcountedMapAllocatorArgCheck::THRefcountedMapAllocatorArgCheck(int flags) {} + +THRefcountedMapAllocator::THRefcountedMapAllocator(const char *filename, int flags, size_t size) { + AT_ERROR("refcounted file mapping not supported on your system"); +} + +THRefcountedMapAllocator::THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) { + AT_ERROR("refcounted file mapping not supported on your system"); +} + +void THRefcountedMapAllocator::initializeAlloc() {} +THRefcountedMapAllocator::~THRefcountedMapAllocator() {} + +#endif + +static void deleteTHMapAllocator(void* ptr) { + delete static_cast(ptr); +} + +static void deleteTHRefcountedMapAllocator(void* ptr) { + delete static_cast(ptr); +} + +THMapAllocator* THMapAllocator::fromDataPtr(const at::DataPtr& dptr) { + return dptr.cast_context(&deleteTHMapAllocator); +} + +THRefcountedMapAllocator* THRefcountedMapAllocator::fromDataPtr(const at::DataPtr& dptr) { + return dptr.cast_context(&deleteTHRefcountedMapAllocator); +} + +at::DataPtr THMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THMapAllocator(filename, flags, size); + if (actual_size_out) *actual_size_out = context->size(); + return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; +} + +at::DataPtr THMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THMapAllocator(WITH_FD, filename, fd, flags, size); + if (actual_size_out) *actual_size_out = context->size(); + return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; +} + +at::DataPtr THRefcountedMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THRefcountedMapAllocator(filename, flags, size); + if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; +} + +at::DataPtr THRefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { + auto* context = new THRefcountedMapAllocator(WITH_FD, filename, fd, flags, size); + if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; +} + +void* THRefcountedMapAllocator::data() const { + return static_cast(static_cast(base_ptr_) + TH_ALLOC_ALIGNMENT); +} diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h new file mode 100644 index 0000000..460f238 --- /dev/null +++ b/aten/src/TH/THAllocator.h @@ -0,0 +1,111 @@ +#pragma once + +#include "THGeneral.h" + +#ifdef __cplusplus +#include +#endif + +#define TH_ALLOCATOR_MAPPED_SHARED 1 +#define TH_ALLOCATOR_MAPPED_SHAREDMEM 2 +#define TH_ALLOCATOR_MAPPED_EXCLUSIVE 4 +#define TH_ALLOCATOR_MAPPED_NOCREATE 8 +#define TH_ALLOCATOR_MAPPED_KEEPFD 16 +#define TH_ALLOCATOR_MAPPED_FROMFD 32 +#define TH_ALLOCATOR_MAPPED_UNLINK 64 + +#ifdef __cplusplus +using THAllocator = at::Allocator; +#else +// struct at_THAllocator doesn't and will never exist, but we cannot name +// the actual struct because it's a namespaced C++ thing +typedef struct at_THAllocator THAllocator; +#endif + +/* default malloc/free allocator. malloc and realloc raise an error (using + * THError) on allocation failure. 
+ */ +TH_API THAllocator* getTHDefaultAllocator(void); + +#ifdef __cplusplus +// Sentinel value/type to help distinguish the file descriptor constructor from +// the non-file descriptor constructor +enum WithFd { WITH_FD }; + +class AT_API THMapAllocator { +public: + THMapAllocator(const char *filename, int flags, size_t size); + THMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); + THMapAllocator(const THMapAllocator&) = delete; + THMapAllocator& operator=(const THMapAllocator&) = delete; + THMapAllocator(THMapAllocator&&) = delete; + THMapAllocator& operator=(THMapAllocator&&) = delete; + + const char* filename() const { return filename_.c_str(); } + int fd() const { +#ifdef _WIN32 + AT_ERROR("THMapAllocator::fd() is unsupported on Windows"); +#else + return fd_; +#endif + } + ptrdiff_t size() const { return size_; } + // Return a pointer to the actual data for this allocator + // (in the case of the refcounted allocator, this is offset + // from the base pointer.) + virtual void* data() const { return base_ptr_; } + + static THMapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out); + static at::DataPtr makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out); + + // Closes the data. Helps us avoid destructor shenanigans + virtual void close(); + + // This is very dangerous. You have to redefine this destructor for each + // subclass + virtual ~THMapAllocator() { close(); } + +protected: + bool closed_ = false; + std::string filename_; + int flags_ = 0; + ptrdiff_t size_; /* mapped size */ +#ifdef _WIN32 + void* handle_; + void* event_; + std::string eventname_; +#else + int fd_ = -1; +#endif + void *base_ptr_ = nullptr; +}; + +// Base-from-member idiom +struct AT_API THRefcountedMapAllocatorArgCheck { + THRefcountedMapAllocatorArgCheck(int flags); +}; + +class AT_API THRefcountedMapAllocator : private THRefcountedMapAllocatorArgCheck, public THMapAllocator { +public: + THRefcountedMapAllocator(const char *filename, int flags, size_t size); + THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); + + static THRefcountedMapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out); + static at::DataPtr makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out); + + void* data() const override; + + void incref(); + int decref(); + void close() override; + + virtual ~THRefcountedMapAllocator() { close(); } + +protected: + void checkFlags(); + void initializeAlloc(); +}; + +#endif // __cplusplus diff --git a/aten/src/TH/THBlas.cpp b/aten/src/TH/THBlas.cpp new file mode 100644 index 0000000..7523c9e --- /dev/null +++ b/aten/src/TH/THBlas.cpp @@ -0,0 +1,4 @@ +#include "THBlas.h" + +#include "generic/THBlas.cpp" +#include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THBlas.h b/aten/src/TH/THBlas.h new file mode 100644 index 0000000..5fef0fe --- /dev/null +++ b/aten/src/TH/THBlas.h @@ -0,0 +1,11 @@ +#ifndef TH_BLAS_INC +#define TH_BLAS_INC + +#include "THGeneral.h" + +#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME) + +#include "generic/THBlas.h" +#include "THGenerateAllTypes.h" + +#endif diff --git a/aten/src/TH/THBlasUtils.h b/aten/src/TH/THBlasUtils.h new file mode 100644 index 0000000..8281047 --- /dev/null +++ b/aten/src/TH/THBlasUtils.h @@ -0,0 +1,32 @@ 
+#include +#include + +// This header file shouldn't be anything permanent; it's just a temporary +// dumping ground to help you get access to utilities in THBlas.h via templates, +// rather than by name directly. Someone should figure out a reasonable way to +// rewrite these in more idiomatic ATen and move it into ATen proper. + +template +inline void THBlas_axpy(int64_t n, T a, T *x, int64_t incx, T *y, int64_t incy); + +#define AXPY_SPECIALIZATION(ctype,name,_1) \ + template<> \ + inline void THBlas_axpy(int64_t n, ctype a, ctype *x, int64_t incx, \ + ctype *y, int64_t incy) { \ + TH ## name ## Blas_axpy(n, a, x, incx, y, incy); \ + } + +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(AXPY_SPECIALIZATION) + + +template +inline void THBlas_copy(int64_t n, T *x, int64_t incx, T *y, int64_t incy); + +#define COPY_SPECIALIZATION(ctype,name,_1) \ + template<> \ + inline void THBlas_copy(int64_t n, ctype *x, int64_t incx, \ + ctype *y, int64_t incy) { \ + TH ## name ## Blas_copy(n, x, incx, y, incy); \ + } + +AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(COPY_SPECIALIZATION) diff --git a/aten/src/TH/THConfig.cmake.in b/aten/src/TH/THConfig.cmake.in new file mode 100644 index 0000000..306cd87 --- /dev/null +++ b/aten/src/TH/THConfig.cmake.in @@ -0,0 +1,9 @@ +# Find the TH includes and library +# +# TH_INCLUDE_DIR -- where to find the includes +# TH_LIBRARIES -- list of libraries to link against +# TH_FOUND -- set to 1 if found + +SET(TH_FOUND 1) +SET(TH_INCLUDE_DIR "@TH_INCLUDE_DIR@") +SET(TH_LIBRARIES "@TH_LIBRARIES@") diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp new file mode 100644 index 0000000..258ad2c --- /dev/null +++ b/aten/src/TH/THDiskFile.cpp @@ -0,0 +1,801 @@ +#include "THGeneral.h" +#include "THDiskFile.h" +#include "THFilePrivate.h" + +#ifndef _WIN32 +#include +#endif + +#include +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + +typedef struct THDiskFile__ +{ + THFile file; + + FILE *handle; + char *name; + int isNativeEncoding; + int longSize; + +} THDiskFile; + +static int THDiskFile_isOpened(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)self; + return (dfself->handle != NULL); +} + +const char *THDiskFile_name(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)self; + return dfself->name; +} + +/* workaround mac osx lion ***insane*** fread bug */ +#ifdef __APPLE__ +size_t fread__(void *ptr, size_t size, size_t nitems, FILE *stream) +{ + size_t nread = 0; + while(!feof(stream) && !ferror(stream) && (nread < nitems)) + nread += fread((char*)ptr+nread*size, size, THMin(2147483648/size, nitems-nread), stream); + return nread; +} +#else +#define fread__ fread +#endif + +#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM) \ + static ssize_t THDiskFile_read##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THDiskFile *dfself = (THDiskFile*)(self); \ + ssize_t nread = 0L; \ + \ + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); \ + \ + if(dfself->file.isBinary) \ + { \ + nread = fread__(data, sizeof(TYPE), n, dfself->handle); \ + if(!dfself->isNativeEncoding && (sizeof(TYPE) > 1) && (nread > 0)) \ + THDiskFile_reverseMemory(data, data, sizeof(TYPE), nread); \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ASCII_READ_ELEM; /* increment here result and break if wrong */ \ + } \ + if(dfself->file.isAutoSpacing && (n > 0)) \ + { \ + int c = fgetc(dfself->handle); \ + if( (c != '\n') && (c != EOF) ) \ + 
ungetc(c, dfself->handle); \ + } \ + } \ + \ + if(nread != n) \ + { \ + dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */ \ + if(!dfself->file.isQuiet) \ + THError("read error: read %d blocks instead of %d", nread, n); \ + } \ + \ + return nread; \ + } \ + \ + static ssize_t THDiskFile_write##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THDiskFile *dfself = (THDiskFile*)(self); \ + ssize_t nwrite = 0L; \ + \ + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); \ + \ + if(dfself->file.isBinary) \ + { \ + if(dfself->isNativeEncoding) \ + { \ + nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ + } \ + else \ + { \ + if(sizeof(TYPE) > 1) \ + { \ + char *buffer = static_cast(THAlloc(sizeof(TYPE)*n)); \ + THDiskFile_reverseMemory(buffer, data, sizeof(TYPE), n); \ + nwrite = fwrite(buffer, sizeof(TYPE), n, dfself->handle); \ + THFree(buffer); \ + } \ + else \ + nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ + } \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ASCII_WRITE_ELEM; \ + if( dfself->file.isAutoSpacing && (i < n-1) ) \ + fprintf(dfself->handle, " "); \ + } \ + if(dfself->file.isAutoSpacing && (n > 0)) \ + fprintf(dfself->handle, "\n"); \ + } \ + \ + if(nwrite != n) \ + { \ + dfself->file.hasError = 1; \ + if(!dfself->file.isQuiet) \ + THError("write error: wrote %d blocks instead of %d", nwrite, n); \ + } \ + \ + return nwrite; \ +} + +static int THDiskFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + else if(strlen(mode) == 2) + { + if(mode[0] == 'r' && mode[1] == 'w') + { + *isReadable = 1; + *isWritable = 1; + return 1; + } + } + return 0; +} + +static void THDiskFile_synchronize(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + fflush(dfself->handle); +} + +static void THDiskFile_seek(THFile *self, ssize_t position) +{ + THDiskFile *dfself = (THDiskFile*)(self); + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + THArgCheck(position <= INT64_MAX, 2, "position must be smaller than INT64_MAX"); + if(_fseeki64(dfself->handle, (int64_t)position, SEEK_SET) < 0) +#elif defined(_WIN32) + THArgCheck(position <= LONG_MAX, 2, "position must be smaller than LONG_MAX"); + if(fseek(dfself->handle, (int32_t)position, SEEK_SET) < 0) +#else + THArgCheck(position <= LLONG_MAX, 2, "position must be smaller than LLONG_MAX"); + if(fseeko(dfself->handle, (off_t)position, SEEK_SET) < 0) +#endif + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("unable to seek to position %zu", position); + } +} + +static void THDiskFile_seekEnd(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + if(_fseeki64(dfself->handle, 0, SEEK_END) < 0) +#elif defined(_WIN32) + if(fseek(dfself->handle, 0, SEEK_END) < 0) +#else + if(fseeko(dfself->handle, 0, SEEK_END) < 0) +#endif + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("unable to seek at end of file"); + } +} + +static ssize_t THDiskFile_position(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + 
THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + +#if defined(_WIN64) + int64_t offset = _ftelli64(dfself->handle); +#elif defined(_WIN32) + int32_t offset = ftell(dfself->handle); +#else + off_t offset = ftello(dfself->handle); +#endif + if (offset > -1) + return (ssize_t)offset; + else if(!dfself->file.isQuiet) + THError("unable to obtain disk file offset (maybe a long overflow occurred)"); + + return 0; +} + +static void THDiskFile_close(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + fclose(dfself->handle); + dfself->handle = NULL; +} + +/* Little and Big Endian */ + +static void THDiskFile_reverseMemory(void *dst, const void *src, ssize_t blockSize, ssize_t numBlocks) +{ + if(blockSize > 1) + { + ssize_t halfBlockSize = blockSize/2; + char *charSrc = (char*)src; + char *charDst = (char*)dst; + ssize_t b, i; + for(b = 0; b < numBlocks; b++) + { + for(i = 0; i < halfBlockSize; i++) + { + char z = charSrc[i]; + charDst[i] = charSrc[blockSize-1-i]; + charDst[blockSize-1-i] = z; + } + charSrc += blockSize; + charDst += blockSize; + } + } +} + +int THDiskFile_isLittleEndianCPU(void) +{ + int x = 7; + char *ptr = (char *)&x; + + if(ptr[0] == 0) + return 0; + else + return 1; +} + +int THDiskFile_isBigEndianCPU(void) +{ + return(!THDiskFile_isLittleEndianCPU()); +} + +void THDiskFile_nativeEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = 1; +} + +void THDiskFile_littleEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = THDiskFile_isLittleEndianCPU(); +} + +void THDiskFile_bigEndianEncoding(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + dfself->isNativeEncoding = !THDiskFile_isLittleEndianCPU(); +} + +/* End of Little and Big Endian Stuff */ + +void THDiskFile_longSize(THFile *self, int size) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); + dfself->longSize = size; +} + +void THDiskFile_noBuffer(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + if (setvbuf(dfself->handle, NULL, _IONBF, 0)) { + THError("error: cannot disable buffer"); + } +} + +static void THDiskFile_free(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + if(dfself->handle) + fclose(dfself->handle); + THFree(dfself->name); + THFree(dfself); +} + +/* READ_WRITE_METHODS(int, Bool, */ +/* int value = 0; int ret = fscanf(file->handle, "%d", &value); array[i] = (value ? 1 : 0); if(ret <= 0) break; else result++, */ +/* int value = (array[i] ? 
1 : 0); nElemWritten = fprintf(file->handle, "%d", value), */ +/* true) */ + +/* Note that we do a trick */ +READ_WRITE_METHODS(uint8_t, Byte, + nread = fread(data, 1, n, dfself->handle); break, + nwrite = fwrite(data, 1, n, dfself->handle); break) + +READ_WRITE_METHODS(int8_t, Char, + nread = fread(data, 1, n, dfself->handle); break, + nwrite = fwrite(data, 1, n, dfself->handle); break) + +READ_WRITE_METHODS(int16_t, Short, + int ret = fscanf(dfself->handle, "%hd", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%hd", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(int32_t, Int, + int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(float, Float, + int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(THHalf, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) + +READ_WRITE_METHODS(double, Double, + int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, + int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++) + + +/* For Long we need to rewrite everything, because of the special management of longSize */ +static ssize_t THDiskFile_readLong(THFile *self, int64_t *data, ssize_t n) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nread = 0L; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); + + if(dfself->file.isBinary) + { + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) + { + nread = fread__(data, sizeof(int64_t), n, dfself->handle); + if(!dfself->isNativeEncoding && (sizeof(int64_t) > 1) && (nread > 0)) + THDiskFile_reverseMemory(data, data, sizeof(int64_t), nread); + } else if(dfself->longSize == 4) + { + nread = fread__(data, 4, n, dfself->handle); + if(!dfself->isNativeEncoding && (nread > 0)) + THDiskFile_reverseMemory(data, data, 4, nread); + ssize_t i; + for(i = nread; i > 0; i--) + data[i-1] = ((int *)data)[i-1]; + } + else /* if(dfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + int32_t *buffer = static_cast(THAlloc(8*n)); + nread = fread__(buffer, 8, n, dfself->handle); + ssize_t i; + for(i = nread; i > 0; i--) + data[i-1] = buffer[2*(i-1) + big_endian]; + THFree(buffer); + if(!dfself->isNativeEncoding && (nread > 0)) + THDiskFile_reverseMemory(data, data, 4, nread); + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + int ret = fscanf(dfself->handle, "%" PRId64, &data[i]); if(ret <= 0) break; else nread++; + } + if(dfself->file.isAutoSpacing && (n > 0)) + { + int c = fgetc(dfself->handle); + if( (c != '\n') && (c != EOF) ) + ungetc(c, dfself->handle); + } + } + + if(nread != n) + { + dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ + if(!dfself->file.isQuiet) + THError("read error: read %d blocks instead of %d", nread, n); + } + + return nread; +} + +static ssize_t THDiskFile_writeLong(THFile *self, int64_t *data, ssize_t n) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nwrite = 0L; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); + + if(dfself->file.isBinary) + { + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) + { + if(dfself->isNativeEncoding) + { + nwrite = fwrite(data, sizeof(int64_t), n, dfself->handle); + } + else + { + char *buffer = static_cast(THAlloc(sizeof(int64_t)*n)); + THDiskFile_reverseMemory(buffer, data, sizeof(int64_t), n); + nwrite = fwrite(buffer, sizeof(int64_t), n, dfself->handle); + THFree(buffer); + } + } else if(dfself->longSize == 4) + { + int32_t *buffer = static_cast(THAlloc(4*n)); + ssize_t i; + for(i = 0; i < n; i++) + buffer[i] = (int32_t) data[i]; + if(!dfself->isNativeEncoding) + THDiskFile_reverseMemory(buffer, buffer, 4, n); + nwrite = fwrite(buffer, 4, n, dfself->handle); + THFree(buffer); + } + else /* if(dfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + int32_t *buffer = static_cast(THAlloc(8*n)); + ssize_t i; + for(i = 0; i < n; i++) + { + buffer[2*i + !big_endian] = 0; + buffer[2*i + big_endian] = (int32_t) data[i]; + } + if(!dfself->isNativeEncoding) + THDiskFile_reverseMemory(buffer, buffer, 8, n); + nwrite = fwrite(buffer, 8, n, dfself->handle); + THFree(buffer); + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + int ret = fprintf(dfself->handle, "%" PRId64, data[i]); if(ret <= 0) break; else nwrite++; + if( dfself->file.isAutoSpacing && (i < n-1) ) + fprintf(dfself->handle, " "); + } + if(dfself->file.isAutoSpacing && (n > 0)) + fprintf(dfself->handle, "\n"); + } + + if(nwrite != n) + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("write error: wrote %d blocks instead of %d", nwrite, n); + } + + return nwrite; +} + +static ssize_t THDiskFile_readString(THFile *self, const char *format, char **str_) +{ + THDiskFile *dfself = (THDiskFile*)(self); + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); + THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); + +/* note: the string won't survive long, as it is copied into lua */ +/* so 1024 is not that big... */ +#define TBRS_BSZ 1024L + + if(format[1] == 'a') + { + char *p = static_cast(THAlloc(TBRS_BSZ)); + ssize_t total = TBRS_BSZ; + ssize_t pos = 0; + + for (;;) + { + if(total-pos == 0) /* we need more space! */ + { + total += TBRS_BSZ; + p = static_cast(THRealloc(p, total)); + } + pos += fread(p+pos, 1, total-pos, dfself->handle); + if (pos < total) /* eof? */ + { + if(pos == 0) + { + THFree(p); + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + *str_ = p; + return pos; + } + } + } + else + { + char *p = static_cast(THAlloc(TBRS_BSZ)); + ssize_t total = TBRS_BSZ; + ssize_t pos = 0; + ssize_t size; + + for (;;) + { + if(total-pos <= 1) /* we can only write '\0' in there! */ + { + total += TBRS_BSZ; + p = static_cast(THRealloc(p, total)); + } + if (fgets(p+pos, (int) (total-pos), dfself->handle) == NULL) /* eof? 
*/ + { + if(pos == 0) + { + THFree(p); + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + *str_ = p; + return pos; + } + size = strlen(p+pos); + if (size == 0 || (p+pos)[size-1] != '\n') + { + pos += size; + } + else + { + pos += size-1; /* do not include `eol' */ + *str_ = p; + return pos; + } + } + } + + *str_ = NULL; + return 0; +} + + +static ssize_t THDiskFile_writeString(THFile *self, const char *str, ssize_t size) +{ + THDiskFile *dfself = (THDiskFile*)(self); + ssize_t nwrite; + + THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); + THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); + + nwrite = fwrite(str, 1, size, dfself->handle); + if(nwrite != size) + { + dfself->file.hasError = 1; + if(!dfself->file.isQuiet) + THError("write error: wrote %zu blocks instead of %zu", nwrite, size); + } + + return nwrite; +} + +THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet) +{ + static struct THFileVTable vtable = { + THDiskFile_isOpened, + + THDiskFile_readByte, + THDiskFile_readChar, + THDiskFile_readShort, + THDiskFile_readInt, + THDiskFile_readLong, + THDiskFile_readFloat, + THDiskFile_readDouble, + THDiskFile_readHalf, + THDiskFile_readString, + + THDiskFile_writeByte, + THDiskFile_writeChar, + THDiskFile_writeShort, + THDiskFile_writeInt, + THDiskFile_writeLong, + THDiskFile_writeFloat, + THDiskFile_writeDouble, + THDiskFile_writeHalf, + THDiskFile_writeString, + + THDiskFile_synchronize, + THDiskFile_seek, + THDiskFile_seekEnd, + THDiskFile_position, + THDiskFile_close, + THDiskFile_free + }; + + int isReadable; + int isWritable; + FILE *handle; + THDiskFile *self; + + THArgCheck(THDiskFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + + if( isReadable && isWritable ) + { + handle = fopen(name, "r+b"); + if(!handle) + { + handle = fopen(name, "wb"); + if(handle) + { + fclose(handle); + handle = fopen(name, "r+b"); + } + } + } + else + handle = fopen(name, (isReadable ? "rb" : "wb")); + + if(!handle) + { + if(isQuiet) + return 0; + else + THError("cannot open <%s> in mode %c%c", name, (isReadable ? 'r' : ' '), (isWritable ? 
'w' : ' ')); + } + + self = static_cast(THAlloc(sizeof(THDiskFile))); + + self->handle = handle; + self->name = static_cast(THAlloc(strlen(name)+1)); + strcpy(self->name, name); + self->isNativeEncoding = 1; + self->longSize = 0; + + self->file.vtable = &vtable; + self->file.isQuiet = isQuiet; + self->file.isReadable = isReadable; + self->file.isWritable = isWritable; + self->file.isBinary = 0; + self->file.isAutoSpacing = 1; + self->file.hasError = 0; + + return (THFile*)self; +} + +/* PipeFile */ + +static int THPipeFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + return 0; +} + +static void THPipeFile_free(THFile *self) +{ + THDiskFile *dfself = (THDiskFile*)(self); + if(dfself->handle) + pclose(dfself->handle); + THFree(dfself->name); + THFree(dfself); +} + +THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet) +{ + static struct THFileVTable vtable = { + THDiskFile_isOpened, + + THDiskFile_readByte, + THDiskFile_readChar, + THDiskFile_readShort, + THDiskFile_readInt, + THDiskFile_readLong, + THDiskFile_readFloat, + THDiskFile_readDouble, + THDiskFile_readHalf, + THDiskFile_readString, + + THDiskFile_writeByte, + THDiskFile_writeChar, + THDiskFile_writeShort, + THDiskFile_writeInt, + THDiskFile_writeLong, + THDiskFile_writeFloat, + THDiskFile_writeDouble, + THDiskFile_writeHalf, + THDiskFile_writeString, + + THDiskFile_synchronize, + THDiskFile_seek, + THDiskFile_seekEnd, + THDiskFile_position, + THDiskFile_close, + THPipeFile_free + }; + + int isReadable; + int isWritable; + FILE *handle; + THDiskFile *self; + + THArgCheck(THPipeFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w'"); + +#ifdef _WIN32 + handle = _popen(name, (isReadable ? "rb" : "wb")); +#else + handle = popen(name, (isReadable ? "r" : "w")); +#endif + + if(!handle) + { + if(isQuiet) + return 0; + else + THError("cannot open <%s> in mode %c%c. This might be because eg the executable doesn't exist, but it could also be because you are out of memory.", name, (isReadable ? 'r' : ' '), (isWritable ? 
'w' : ' ')); + } + + self = static_cast(THAlloc(sizeof(THDiskFile))); + + self->handle = handle; + self->name = static_cast(THAlloc(strlen(name)+1)); + strcpy(self->name, name); + self->isNativeEncoding = 1; + self->longSize = 0; + + self->file.vtable = &vtable; + self->file.isQuiet = isQuiet; + self->file.isReadable = isReadable; + self->file.isWritable = isWritable; + self->file.isBinary = 0; + self->file.isAutoSpacing = 1; + self->file.hasError = 0; + + return (THFile*)self; +} diff --git a/aten/src/TH/THDiskFile.h b/aten/src/TH/THDiskFile.h new file mode 100644 index 0000000..bc5c001 --- /dev/null +++ b/aten/src/TH/THDiskFile.h @@ -0,0 +1,19 @@ +#ifndef TH_DISK_FILE_INC +#define TH_DISK_FILE_INC + +#include "THFile.h" + +TH_API THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet); +TH_API THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet); + +TH_API const char *THDiskFile_name(THFile *self); + +TH_API int THDiskFile_isLittleEndianCPU(void); +TH_API int THDiskFile_isBigEndianCPU(void); +TH_API void THDiskFile_nativeEndianEncoding(THFile *self); +TH_API void THDiskFile_littleEndianEncoding(THFile *self); +TH_API void THDiskFile_bigEndianEncoding(THFile *self); +TH_API void THDiskFile_longSize(THFile *self, int size); +TH_API void THDiskFile_noBuffer(THFile *self); + +#endif diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp new file mode 100644 index 0000000..f3e1741 --- /dev/null +++ b/aten/src/TH/THFile.cpp @@ -0,0 +1,158 @@ +#include "THFile.h" +#include "THStorage.hpp" +#include "THFilePrivate.h" + +#define IMPLEMENT_THFILE_RW(TYPEC, TYPE) \ + size_t THFile_read##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ + { \ + return (*self->vtable->read##TYPEC)(self, data, n); \ + } \ + \ + size_t THFile_write##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ + { \ + return (*self->vtable->write##TYPEC)(self, data, n); \ + } + +IMPLEMENT_THFILE_RW(Byte, uint8_t) +IMPLEMENT_THFILE_RW(Char, int8_t) +IMPLEMENT_THFILE_RW(Short, int16_t) +IMPLEMENT_THFILE_RW(Int, int32_t) +IMPLEMENT_THFILE_RW(Long, int64_t) +IMPLEMENT_THFILE_RW(Float, float) +IMPLEMENT_THFILE_RW(Double, double) +IMPLEMENT_THFILE_RW(Half, THHalf) + +size_t THFile_readStringRaw(THFile *self, const char *format, char **str_) +{ + return self->vtable->readString(self, format, str_); +} + +size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size) +{ + return self->vtable->writeString(self, str, size); +} + +void THFile_synchronize(THFile *self) +{ + self->vtable->synchronize(self); +} + +void THFile_seek(THFile *self, size_t position) +{ + self->vtable->seek(self, position); +} + +void THFile_seekEnd(THFile *self) +{ + self->vtable->seekEnd(self); +} + +size_t THFile_position(THFile *self) +{ + return self->vtable->position(self); +} + +void THFile_close(THFile *self) +{ + self->vtable->close(self); +} + +void THFile_free(THFile *self) +{ + self->vtable->free(self); +} + +int THFile_isOpened(THFile *self) +{ + return self->vtable->isOpened(self); +} + +#define IMPLEMENT_THFILE_FLAGS(FLAG) \ + int THFile_##FLAG(THFile *self) \ + { \ + return self->FLAG; \ + } + +IMPLEMENT_THFILE_FLAGS(isQuiet) +IMPLEMENT_THFILE_FLAGS(isReadable) +IMPLEMENT_THFILE_FLAGS(isWritable) +IMPLEMENT_THFILE_FLAGS(isBinary) +IMPLEMENT_THFILE_FLAGS(isAutoSpacing) +IMPLEMENT_THFILE_FLAGS(hasError) + +void THFile_binary(THFile *self) +{ + self->isBinary = 1; +} + +void THFile_ascii(THFile *self) +{ + self->isBinary = 0; +} + +void THFile_autoSpacing(THFile *self) +{ + self->isAutoSpacing = 1; +} + 
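+
+/* The setters in this block only flip flags on the THFile struct; the concrete
+   back ends (THDiskFile, THMemoryFile) consult them on every read/write call.
+   A minimal usage sketch (illustrative only -- the file name is made up and
+   all error handling is omitted):
+
+     THFile *f = THDiskFile_new("tensor.bin", "w", 0);
+     THFile_binary(f);              // raw machine representation, not ASCII
+     THFile_writeIntScalar(f, 42);
+     THFile_synchronize(f);
+     THFile_free(f);                // also closes the underlying handle
+*/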
+void THFile_noAutoSpacing(THFile *self) +{ + self->isAutoSpacing = 0; +} + +void THFile_quiet(THFile *self) +{ + self->isQuiet = 1; +} + +void THFile_pedantic(THFile *self) +{ + self->isQuiet = 0; +} + +void THFile_clearError(THFile *self) +{ + self->hasError = 0; +} + +#define IMPLEMENT_THFILE_SCALAR(TYPEC, TYPE) \ + TYPE THFile_read##TYPEC##Scalar(THFile *self) \ + { \ + TYPE scalar; \ + THFile_read##TYPEC##Raw(self, &scalar, 1); \ + return scalar; \ + } \ + \ + void THFile_write##TYPEC##Scalar(THFile *self, TYPE scalar) \ + { \ + THFile_write##TYPEC##Raw(self, &scalar, 1); \ + } + +IMPLEMENT_THFILE_SCALAR(Byte, uint8_t) +IMPLEMENT_THFILE_SCALAR(Char, int8_t) +IMPLEMENT_THFILE_SCALAR(Short, int16_t) +IMPLEMENT_THFILE_SCALAR(Int, int32_t) +IMPLEMENT_THFILE_SCALAR(Long, int64_t) +IMPLEMENT_THFILE_SCALAR(Float, float) +IMPLEMENT_THFILE_SCALAR(Double, double) +IMPLEMENT_THFILE_SCALAR(Half, THHalf) + +#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ + size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ + { \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size); \ + } \ + \ + size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ + { \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size); \ + } + +IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) +IMPLEMENT_THFILE_STORAGE(Char, int8_t) +IMPLEMENT_THFILE_STORAGE(Short, int16_t) +IMPLEMENT_THFILE_STORAGE(Int, int32_t) +IMPLEMENT_THFILE_STORAGE(Long, int64_t) +IMPLEMENT_THFILE_STORAGE(Float, float) +IMPLEMENT_THFILE_STORAGE(Double, double) +IMPLEMENT_THFILE_STORAGE(Half, THHalf) diff --git a/aten/src/TH/THFile.h b/aten/src/TH/THFile.h new file mode 100644 index 0000000..27041f5 --- /dev/null +++ b/aten/src/TH/THFile.h @@ -0,0 +1,91 @@ +#ifndef TH_FILE_INC +#define TH_FILE_INC + +#include "THStorage.h" + +typedef struct THFile__ THFile; + +TH_API int THFile_isOpened(THFile *self); +TH_API int THFile_isQuiet(THFile *self); +TH_API int THFile_isReadable(THFile *self); +TH_API int THFile_isWritable(THFile *self); +TH_API int THFile_isBinary(THFile *self); +TH_API int THFile_isAutoSpacing(THFile *self); +TH_API int THFile_hasError(THFile *self); + +TH_API void THFile_binary(THFile *self); +TH_API void THFile_ascii(THFile *self); +TH_API void THFile_autoSpacing(THFile *self); +TH_API void THFile_noAutoSpacing(THFile *self); +TH_API void THFile_quiet(THFile *self); +TH_API void THFile_pedantic(THFile *self); +TH_API void THFile_clearError(THFile *self); + +/* scalar */ +TH_API uint8_t THFile_readByteScalar(THFile *self); +TH_API int8_t THFile_readCharScalar(THFile *self); +TH_API int16_t THFile_readShortScalar(THFile *self); +TH_API int32_t THFile_readIntScalar(THFile *self); +TH_API int64_t THFile_readLongScalar(THFile *self); +TH_API float THFile_readFloatScalar(THFile *self); +TH_API double THFile_readDoubleScalar(THFile *self); + +TH_API void THFile_writeByteScalar(THFile *self, uint8_t scalar); +TH_API void THFile_writeCharScalar(THFile *self, int8_t scalar); +TH_API void THFile_writeShortScalar(THFile *self, int16_t scalar); +TH_API void THFile_writeIntScalar(THFile *self, int32_t scalar); +TH_API void THFile_writeLongScalar(THFile *self, int64_t scalar); +TH_API void THFile_writeFloatScalar(THFile *self, float scalar); +TH_API void THFile_writeDoubleScalar(THFile *self, double scalar); + +/* storage */ +TH_API size_t THFile_readByte(THFile *self, THByteStorage *storage); +TH_API size_t THFile_readChar(THFile *self, THCharStorage *storage); +TH_API 
size_t THFile_readShort(THFile *self, THShortStorage *storage); +TH_API size_t THFile_readInt(THFile *self, THIntStorage *storage); +TH_API size_t THFile_readLong(THFile *self, THLongStorage *storage); +TH_API size_t THFile_readFloat(THFile *self, THFloatStorage *storage); +TH_API size_t THFile_readDouble(THFile *self, THDoubleStorage *storage); + +TH_API size_t THFile_writeByte(THFile *self, THByteStorage *storage); +TH_API size_t THFile_writeChar(THFile *self, THCharStorage *storage); +TH_API size_t THFile_writeShort(THFile *self, THShortStorage *storage); +TH_API size_t THFile_writeInt(THFile *self, THIntStorage *storage); +TH_API size_t THFile_writeLong(THFile *self, THLongStorage *storage); +TH_API size_t THFile_writeFloat(THFile *self, THFloatStorage *storage); +TH_API size_t THFile_writeDouble(THFile *self, THDoubleStorage *storage); + +/* raw */ +TH_API size_t THFile_readByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_readCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_readShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_readIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_readLongRaw(THFile *self, int64_t *data, size_t n); +TH_API size_t THFile_readFloatRaw(THFile *self, float *data, size_t n); +TH_API size_t THFile_readDoubleRaw(THFile *self, double *data, size_t n); +TH_API size_t THFile_readStringRaw(THFile *self, const char *format, char **str_); /* you must deallocate str_ */ + +TH_API size_t THFile_writeByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_writeCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_writeShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_writeIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_writeLongRaw(THFile *self, int64_t *data, size_t n); +TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n); +TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n); +TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size); + +TH_API THHalf THFile_readHalfScalar(THFile *self); +TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar); +TH_API size_t THFile_readHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size); +TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size); + +TH_API void THFile_synchronize(THFile *self); +TH_API void THFile_seek(THFile *self, size_t position); +TH_API void THFile_seekEnd(THFile *self); +TH_API size_t THFile_position(THFile *self); +TH_API void THFile_close(THFile *self); +TH_API void THFile_free(THFile *self); + +#endif diff --git a/aten/src/TH/THFilePrivate.h b/aten/src/TH/THFilePrivate.h new file mode 100644 index 0000000..93bbaa0 --- /dev/null +++ b/aten/src/TH/THFilePrivate.h @@ -0,0 +1,50 @@ +#include "THGeneral.h" + +#include "THHalf.h" + + +struct THFile__ +{ + struct THFileVTable *vtable; + + int isQuiet; + int isReadable; + int isWritable; + int isBinary; + int isAutoSpacing; + int hasError; +}; + +/* virtual table definition */ + +struct THFileVTable +{ + int (*isOpened)(THFile *self); + + ssize_t (*readByte)(THFile *self, uint8_t *data, ssize_t n); + ssize_t (*readChar)(THFile *self, int8_t *data, ssize_t n); + ssize_t (*readShort)(THFile *self, int16_t *data, ssize_t n); + ssize_t (*readInt)(THFile *self, int32_t *data, ssize_t n); + ssize_t 
(*readLong)(THFile *self, int64_t *data, ssize_t n); + ssize_t (*readFloat)(THFile *self, float *data, ssize_t n); + ssize_t (*readDouble)(THFile *self, double *data, ssize_t n); + ssize_t (*readHalf)(THFile *self, THHalf *data, ssize_t n); + ssize_t (*readString)(THFile *self, const char *format, char **str_); + + ssize_t (*writeByte)(THFile *self, uint8_t *data, ssize_t n); + ssize_t (*writeChar)(THFile *self, int8_t *data, ssize_t n); + ssize_t (*writeShort)(THFile *self, int16_t *data, ssize_t n); + ssize_t (*writeInt)(THFile *self, int32_t *data, ssize_t n); + ssize_t (*writeLong)(THFile *self, int64_t *data, ssize_t n); + ssize_t (*writeFloat)(THFile *self, float *data, ssize_t n); + ssize_t (*writeDouble)(THFile *self, double *data, ssize_t n); + ssize_t (*writeHalf)(THFile *self, THHalf *data, ssize_t n); + ssize_t (*writeString)(THFile *self, const char *str, ssize_t size); + + void (*synchronize)(THFile *self); + void (*seek)(THFile *self, ssize_t position); + void (*seekEnd)(THFile *self); + ssize_t (*position)(THFile *self); + void (*close)(THFile *self); + void (*free)(THFile *self); +}; diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp new file mode 100644 index 0000000..667d7fb --- /dev/null +++ b/aten/src/TH/THGeneral.cpp @@ -0,0 +1,328 @@ +#include "THGeneral.h" + +#ifdef _OPENMP +#include +#endif + +#ifndef TH_HAVE_THREAD +#define __thread +#elif _MSC_VER +#define __thread __declspec( thread ) +#endif + +#if (defined(__unix) || defined(_WIN32)) + #if defined(__FreeBSD__) + #include + #else + #include + #endif +#elif defined(__APPLE__) +#include +#endif + +#ifdef TH_BLAS_MKL +// this is the C prototype, while mkl_set_num_threads is the fortran prototype +TH_EXTERNC void MKL_Set_Num_Threads(int); +// this is the C prototype, while mkl_get_max_threads is the fortran prototype +TH_EXTERNC int MKL_Get_Max_Threads(void); +#endif + +/* Torch Error Handling */ +static void defaultErrorHandlerFunction(const char *msg, void *data) +{ + printf("$ Error: %s\n", msg); + exit(-1); +} + +static THErrorHandlerFunction defaultErrorHandler = defaultErrorHandlerFunction; +static void *defaultErrorHandlerData; +static __thread THErrorHandlerFunction threadErrorHandler = NULL; +static __thread void *threadErrorHandlerData; + +void _THError(const char *file, const int line, const char *fmt, ...) +{ + char msg[2048]; + va_list args; + + /* vasprintf not standard */ + /* vsnprintf: how to handle if does not exists? */ + va_start(args, fmt); + int n = vsnprintf(msg, 2048, fmt, args); + va_end(args); + + if(n < 2048) { + snprintf(msg + n, 2048 - n, " at %s:%d", file, line); + } + + if (threadErrorHandler) + (*threadErrorHandler)(msg, threadErrorHandlerData); + else + (*defaultErrorHandler)(msg, defaultErrorHandlerData); + TH_UNREACHABLE; +} + +void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...) { + char msg[1024]; + va_list args; + va_start(args, fmt); + vsnprintf(msg, 1024, fmt, args); + va_end(args); + _THError(file, line, "Assertion `%s' failed. 
%s", exp, msg); +} + +void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data) +{ + threadErrorHandler = new_handler; + threadErrorHandlerData = data; +} + +void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data) +{ + if (new_handler) + defaultErrorHandler = new_handler; + else + defaultErrorHandler = defaultErrorHandlerFunction; + defaultErrorHandlerData = data; +} + +/* Torch Arg Checking Handling */ +static void defaultArgErrorHandlerFunction(int argNumber, const char *msg, void *data) +{ + if(msg) + printf("$ Invalid argument %d: %s\n", argNumber, msg); + else + printf("$ Invalid argument %d\n", argNumber); + exit(-1); +} + +static THArgErrorHandlerFunction defaultArgErrorHandler = defaultArgErrorHandlerFunction; +static void *defaultArgErrorHandlerData; +static __thread THArgErrorHandlerFunction threadArgErrorHandler = NULL; +static __thread void *threadArgErrorHandlerData; + +void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...) +{ + if(!condition) { + char msg[2048]; + va_list args; + + /* vasprintf not standard */ + /* vsnprintf: how to handle if does not exists? */ + va_start(args, fmt); + int n = vsnprintf(msg, 2048, fmt, args); + va_end(args); + + if(n < 2048) { + snprintf(msg + n, 2048 - n, " at %s:%d", file, line); + } + + if (threadArgErrorHandler) + (*threadArgErrorHandler)(argNumber, msg, threadArgErrorHandlerData); + else + (*defaultArgErrorHandler)(argNumber, msg, defaultArgErrorHandlerData); + TH_UNREACHABLE; + } +} + +void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) +{ + threadArgErrorHandler = new_handler; + threadArgErrorHandlerData = data; +} + +void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) +{ + if (new_handler) + defaultArgErrorHandler = new_handler; + else + defaultArgErrorHandler = defaultArgErrorHandlerFunction; + defaultArgErrorHandlerData = data; +} + +static __thread void (*torchGCFunction)(void *data) = NULL; +static __thread void *torchGCData; + +/* Optional hook for integrating with a garbage-collected frontend. + * + * If torch is running with a garbage-collected frontend (e.g. Lua), + * the GC isn't aware of TH-allocated memory so may not know when it + * needs to run. These hooks trigger the GC to run in two cases: + * + * (1) When a memory allocation (malloc, realloc, ...) fails + * (2) When the total TH-allocated memory hits a dynamically-adjusted + * soft maximum. + */ +void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data ) +{ + torchGCFunction = torchGCFunction_; + torchGCData = data; +} + +static void* THAllocInternal(ptrdiff_t size) +{ + void *ptr; + + if (size > 5120) + { +#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN)) + if (posix_memalign(&ptr, 64, size) != 0) + ptr = NULL; +/* +#elif defined(_WIN32) + ptr = _aligned_malloc(size, 64); +*/ +#else + ptr = malloc(size); +#endif + } + else + { + ptr = malloc(size); + } + + return ptr; +} + +void* THAlloc(ptrdiff_t size) +{ + void *ptr; + + if(size < 0) + THError("$ Torch: invalid memory size -- maybe an overflow?"); + + if(size == 0) + return NULL; + + ptr = THAllocInternal(size); + + if(!ptr && torchGCFunction) { + torchGCFunction(torchGCData); + ptr = THAllocInternal(size); + } + + if(!ptr) + THError("$ Torch: not enough memory: you tried to allocate %dGB. 
Buy new RAM!", size/1073741824); + + return ptr; +} + +void* THRealloc(void *ptr, ptrdiff_t size) +{ + if(!ptr) + return(THAlloc(size)); + + if(size == 0) + { + THFree(ptr); + return NULL; + } + + if(size < 0) + THError("$ Torch: invalid memory size -- maybe an overflow?"); + + void *newptr = realloc(ptr, size); + + if(!newptr && torchGCFunction) { + torchGCFunction(torchGCData); + newptr = realloc(ptr, size); + } + + if(!newptr) + THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824); + + return newptr; +} + +void THFree(void *ptr) +{ + free(ptr); +} + +double THLog10(const double x) +{ + return log10(x); +} + +double THLog1p(const double x) +{ +#if (defined(_MSC_VER) || defined(__MINGW32__)) + volatile double y = 1 + x; + return log(y) - ((y-1)-x)/y ; /* cancels errors with IEEE arithmetic */ +#else + return log1p(x); +#endif +} + +double THLog2(const double x) +{ + return log2(x); +} + +double THExpm1(const double x) +{ + return expm1(x); +} + +void THSetNumThreads(int num_threads) +{ +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif +#ifdef TH_BLAS_MKL + MKL_Set_Num_Threads(num_threads); +#endif + +} + +int THGetNumThreads(void) +{ +#ifdef _OPENMP + return omp_get_max_threads(); +#else + return 1; +#endif +} + +int THGetNumCores(void) +{ +#ifdef _OPENMP + return omp_get_num_procs(); +#else + return 1; +#endif +} + +TH_API void THInferNumThreads(void) +{ +#if defined(_OPENMP) && defined(TH_BLAS_MKL) + // If we are using MKL an OpenMP make sure the number of threads match. + // Otherwise, MKL and our OpenMP-enabled functions will keep changing the + // size of the OpenMP thread pool, resulting in worse performance (and memory + // leaks in GCC 5.4) + omp_set_num_threads(MKL_Get_Max_Threads()); +#endif +} + +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { + const int L = TH_DESC_BUFF_LEN; + THDescBuff buf; + char *str = buf.str; + int i, n = 0; + n += snprintf(str, L-n, "["); + + for (i = 0; i < ndim; i++) { + if (n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, size[i]); + if (i < ndim-1) { + n += snprintf(str+n, L-n, " x "); + } + } + + if (n < L - 2) { + snprintf(str+n, L-n, "]"); + } else { + snprintf(str+L-5, 5, "...]"); + } + + return buf; +} diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in new file mode 100644 index 0000000..103710b --- /dev/null +++ b/aten/src/TH/THGeneral.h.in @@ -0,0 +1,187 @@ +#ifndef TH_GENERAL_INC +#define TH_GENERAL_INC + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TH_BLAS_MKL +#include +#endif + +#cmakedefine USE_BLAS +#cmakedefine USE_LAPACK +#cmakedefine BLAS_F2C +#cmakedefine BLAS_USE_CBLAS_DOT + +#ifdef __cplusplus +# define TH_EXTERNC extern "C" +#else +# define TH_EXTERNC extern +#endif + +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define TH_API TH_EXTERNC __declspec(dllexport) +# define TH_CPP_API extern __declspec(dllexport) +# else +# define TH_API TH_EXTERNC __declspec(dllimport) +# define TH_CPP_API extern __declspec(dllimport) +# endif +#else +# define TH_API TH_EXTERNC +# define TH_CPP_API extern +#endif + +#ifdef _WIN32 +# define TH_NO_RETURN __declspec(noreturn) +# define TH_UNREACHABLE +#else +# define TH_NO_RETURN __attribute__((noreturn)) +# define TH_UNREACHABLE __builtin_unreachable(); +#endif + +#if defined(__GNUC__) && ((__GNUC__ > 2) || (__GNUC__ == 2 && 
__GNUC_MINOR__ > 4)) +# define TH_UNUSED __attribute__((unused)) +#else +# define TH_UNUSED +#endif + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +#define TH_INDEX_BASE 0 + +typedef void (*THErrorHandlerFunction)(const char *msg, void *data); +typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data); + +#define TH_DESC_BUFF_LEN 64 +typedef struct { + char str[TH_DESC_BUFF_LEN]; +} THDescBuff; + + +TH_API double THLog1p(const double x); +TH_API double THLog2(const double x); +TH_API double THExpm1(const double x); +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim); +TH_API TH_NO_RETURN void _THError(const char *file, const int line, const char *fmt, ...); +TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...); +TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data); +TH_API void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data); +TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...); +TH_API void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); +TH_API void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); +TH_API void* THAlloc(ptrdiff_t size); +TH_API void* THRealloc(void *ptr, ptrdiff_t size); +TH_API void THFree(void *ptr); +TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data ); +// this hook should only be called by custom allocator functions +TH_API void THHeapUpdate(ptrdiff_t size); +TH_API void THSetNumThreads(int num_threads); +TH_API int THGetNumThreads(void); +TH_API int THGetNumCores(void); +TH_API void THInferNumThreads(void); + +#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__) + +#define THCleanup(...) __VA_ARGS__ + +#define THArgCheck(...) \ +do { \ + _THArgCheck(__FILE__, __LINE__, __VA_ARGS__); \ +} while(0) + +#define THArgError(...) \ +do { \ + _THArgCheck(__FILE__, __LINE__, false, __VA_ARGS__); \ + TH_UNREACHABLE \ +} while(0) + +#define THArgCheckWithCleanup(condition, cleanup, ...) \ +do if (!(condition)) { \ + cleanup \ + _THArgCheck(__FILE__, __LINE__, 0, __VA_ARGS__); \ +} while(0) + +#define THAssert(exp) \ +do { \ + if (!(exp)) { \ + _THAssertionFailed(__FILE__, __LINE__, #exp, ""); \ + } \ +} while(0) + +#define THAssertMsg(exp, ...) \ +do { \ + if (!(exp)) { \ + _THAssertionFailed(__FILE__, __LINE__, #exp, __VA_ARGS__); \ + } \ +} while(0) + +#define TH_CONCAT_STRING_2(x,y) TH_CONCAT_STRING_2_EXPAND(x,y) +#define TH_CONCAT_STRING_2_EXPAND(x,y) #x #y + +#define TH_CONCAT_STRING_3(x,y,z) TH_CONCAT_STRING_3_EXPAND(x,y,z) +#define TH_CONCAT_STRING_3_EXPAND(x,y,z) #x #y #z + +#define TH_CONCAT_STRING_4(x,y,z,w) TH_CONCAT_STRING_4_EXPAND(x,y,z,w) +#define TH_CONCAT_STRING_4_EXPAND(x,y,z,w) #x #y #z #w + +#define TH_CONCAT_2(x,y) TH_CONCAT_2_EXPAND(x,y) +#define TH_CONCAT_2_EXPAND(x,y) x ## y + +#define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) +#define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z + +#define TH_CONCAT_4_EXPAND(x,y,z,w) x ## y ## z ## w +#define TH_CONCAT_4(x,y,z,w) TH_CONCAT_4_EXPAND(x,y,z,w) + +#define THMin(X, Y) ((X) < (Y) ? (X) : (Y)) +#define THMax(X, Y) ((X) > (Y) ? 
(X) : (Y)) + +#if (defined(_MSC_VER) || defined(__MINGW32__)) +#if defined(_MSC_VER) +__inline double log1p(double x) { return THLog1p(x); } +#else +inline double log1p(double x) { return THLog1p(x); } +#endif + +#if defined(_MSC_VER) +__inline double log2(double x) { return THLog2(x); } +#else +inline double log2(double x) { return THLog2(x); } +#endif + +#if defined(_MSC_VER) +__inline double expm1(double x) { return THExpm1(x); } +#else +inline double expm1(double x) { return THExpm1(x); } +#endif + +#define snprintf _snprintf +#define popen _popen +#define pclose _pclose +#include +typedef SSIZE_T ssize_t; +#endif + +#endif diff --git a/aten/src/TH/THGenerateAllTypes.h b/aten/src/TH/THGenerateAllTypes.h new file mode 100644 index 0000000..5b9508d --- /dev/null +++ b/aten/src/TH/THGenerateAllTypes.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THAllLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateFloatTypes.h" +#include "THGenerateIntTypes.h" + +#ifdef THAllLocalGenerateManyTypes +#undef THAllLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateByteType.h b/aten/src/TH/THGenerateByteType.h new file mode 100644 index 0000000..0ec234d --- /dev/null +++ b/aten/src/TH/THGenerateByteType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateByteType.h" +#endif + +#define real uint8_t +#define ureal uint8_t +#define accreal int64_t +#define Real Byte +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define THInf UCHAR_MAX +#define TH_REAL_IS_BYTE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_BYTE +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateCharType.h b/aten/src/TH/THGenerateCharType.h new file mode 100644 index 0000000..9c172f1 --- /dev/null +++ b/aten/src/TH/THGenerateCharType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateCharType.h" +#endif + +#define real int8_t +#define ureal uint8_t +#define accreal int64_t +#define Real Char +#define THInf SCHAR_MAX +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define TH_REAL_IS_CHAR +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_CHAR +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateDoubleType.h b/aten/src/TH/THGenerateDoubleType.h new file mode 100644 index 0000000..fffee60 --- /dev/null +++ b/aten/src/TH/THGenerateDoubleType.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateDoubleType.h" +#endif + +#define real double +#define accreal double +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Double +#define THInf DBL_MAX +#define TH_REAL_IS_DOUBLE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef accreal +#undef 
real +#undef Real +#undef THInf +#undef TH_REAL_IS_DOUBLE +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateFloatType.h b/aten/src/TH/THGenerateFloatType.h new file mode 100644 index 0000000..a31b50c --- /dev/null +++ b/aten/src/TH/THGenerateFloatType.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h" +#endif + +#define real float +#define accreal double +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Float +#define THInf FLT_MAX +#define TH_REAL_IS_FLOAT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef accreal +#undef real +#undef Real +#undef THInf +#undef TH_REAL_IS_FLOAT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateFloatTypes.h b/aten/src/TH/THGenerateFloatTypes.h new file mode 100644 index 0000000..be5ea84 --- /dev/null +++ b/aten/src/TH/THGenerateFloatTypes.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THFloatLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateFloatType.h" +#include "THGenerateDoubleType.h" + +#ifdef THFloatLocalGenerateManyTypes +#undef THFloatLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h new file mode 100644 index 0000000..47ff1e8 --- /dev/null +++ b/aten/src/TH/THGenerateHalfType.h @@ -0,0 +1,25 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h" +#endif + +#include "THHalf.h" +#define real THHalf +#define accreal float +#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define Real Half +#define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) +#define TH_REAL_IS_HALF +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_HALF +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateIntType.h b/aten/src/TH/THGenerateIntType.h new file mode 100644 index 0000000..5135bc5 --- /dev/null +++ b/aten/src/TH/THGenerateIntType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateIntType.h" +#endif + +#define real int32_t +#define ureal uint32_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Int +#define THInf INT_MAX +#define TH_REAL_IS_INT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_INT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateIntTypes.h b/aten/src/TH/THGenerateIntTypes.h new file mode 100644 index 0000000..9931fb1 --- /dev/null +++ b/aten/src/TH/THGenerateIntTypes.h @@ -0,0 +1,20 @@ +#ifndef TH_GENERIC_FILE +#error "You 
must define TH_GENERIC_FILE before including THGenerateIntTypes.h" +#endif + +#ifndef THGenerateManyTypes +#define THIntLocalGenerateManyTypes +#define THGenerateManyTypes +#endif + +#include "THGenerateByteType.h" +#include "THGenerateCharType.h" +#include "THGenerateShortType.h" +#include "THGenerateIntType.h" +#include "THGenerateLongType.h" + +#ifdef THIntLocalGenerateManyTypes +#undef THIntLocalGenerateManyTypes +#undef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateLongType.h b/aten/src/TH/THGenerateLongType.h new file mode 100644 index 0000000..d2b9af0 --- /dev/null +++ b/aten/src/TH/THGenerateLongType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateLongType.h" +#endif + +#define real int64_t +#define ureal uint64_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Long +#define THInf LONG_MAX +#define TH_REAL_IS_LONG +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_LONG +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerateShortType.h b/aten/src/TH/THGenerateShortType.h new file mode 100644 index 0000000..5b83c47 --- /dev/null +++ b/aten/src/TH/THGenerateShortType.h @@ -0,0 +1,26 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateShortType.h" +#endif + +#define real int16_t +#define ureal uint16_t +#define accreal int64_t +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) +#define Real Short +#define THInf SHRT_MAX +#define TH_REAL_IS_SHORT +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef ureal +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_SHORT +#undef TH_CONVERT_REAL_TO_ACCREAL +#undef TH_CONVERT_ACCREAL_TO_REAL + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp new file mode 100644 index 0000000..f1e6914 --- /dev/null +++ b/aten/src/TH/THGenerator.hpp @@ -0,0 +1,29 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include + +struct THGeneratorState { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[_MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/* A THGenerator contains all the state required for a single random number stream */ +struct THGenerator { + std::mutex mutex; /* mutex for using this generator */ + THGeneratorState gen_state; +}; diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp new file mode 100644 index 0000000..1c46c59 --- /dev/null +++ b/aten/src/TH/THHalf.cpp @@ -0,0 +1,100 @@ +#include "THHalf.h" + +/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
*/ + +THHalf TH_float2half(float f) +{ + THHalf h; + TH_float2halfbits(&f, &h.x); + return h; +} + +TH_API float TH_half2float(THHalf h) +{ + float f; + TH_halfbits2float(&h.x, &f); + return f; +} + +// Host functions for converting between FP32 and FP16 formats + +void TH_halfbits2float(unsigned short* src, float* res) +{ + unsigned h = *src; + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); +} + +void TH_float2halfbits(float* src, unsigned short* dest) +{ + unsigned x = *(unsigned*)src; + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + *dest = 0x7fffU; + return ; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + *dest = sign | 0x7c00U; + return; + } + if (u < 0x33000001) { + *dest = (sign | 0x0000); + return; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + *dest = (sign | (exponent << 10) | mantissa); +} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h new file mode 100644 index 0000000..0f9807b --- /dev/null +++ b/aten/src/TH/THHalf.h @@ -0,0 +1,41 @@ +#ifndef TH_HALF_H +#define TH_HALF_H + +#include "THGeneral.h" +#include + +/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ +#if defined(__GNUC__) +#define __thalign__(n) __attribute__((aligned(n))) +#elif defined(_WIN32) +#define __thalign__(n) __declspec(align(n)) +#else +#define __thalign__(n) +#endif + +typedef struct __thalign__(2){ + unsigned short x; +} __THHalf; + +typedef struct __thalign__(4) { + unsigned int x; +} __THHalf2; + +typedef __THHalf THHalf; +typedef __THHalf2 THHalf2; + +TH_API void TH_float2halfbits(float*, unsigned short*); +TH_API void TH_halfbits2float(unsigned short*, float*); + +TH_API THHalf TH_float2half(float); +TH_API float TH_half2float(THHalf); + +#ifndef TH_HALF_BITS_TO_LITERAL +# define TH_HALF_BITS_TO_LITERAL(n) { n } +#endif + +#define TH_HALF_ZERO 0x0U +#define TH_HALF_INF 0x7C00U + +#undef __thalign__ +#endif diff --git a/aten/src/TH/THLapack.cpp b/aten/src/TH/THLapack.cpp new file mode 100644 index 0000000..e340a63 --- /dev/null +++ b/aten/src/TH/THLapack.cpp @@ -0,0 +1,4 @@ +#include "THLapack.h" + +#include "generic/THLapack.cpp" +#include "THGenerateFloatTypes.h" diff --git a/aten/src/TH/THLapack.h b/aten/src/TH/THLapack.h new file mode 100644 index 0000000..614d15f --- /dev/null +++ b/aten/src/TH/THLapack.h @@ -0,0 +1,27 @@ +#ifndef TH_LAPACK_INC +#define TH_LAPACK_INC 
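+
+/* The macros below wrap the LAPACK `info` convention: info < 0 flags an
+   illegal argument, info > 0 is a routine-specific failure, and both cases are
+   turned into THError calls. An illustrative call site (the routine name and
+   the `info` variable are hypothetical, taken from a wrapper that has already
+   run the LAPACK routine):
+
+     THLapackCheck("Lapack Error in %s : U(%d,%d) is zero, singular U", "gesv", info, info);
+*/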
+ +#include "THGeneral.h" + +#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME) + +#define THLapackCheck(fmt, func, info , ...) \ +if (info < 0) { \ + THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ +} else if(info > 0) { \ + THError(fmt, func, info, ##__VA_ARGS__); \ +} \ + +#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...) \ +if (info < 0) { \ + cleanup \ + THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ +} else if(info > 0) { \ + cleanup \ + THError(fmt, func, info, ##__VA_ARGS__); \ +} + +#include "generic/THLapack.h" +#include "THGenerateAllTypes.h" + +#endif diff --git a/aten/src/TH/THLogAdd.cpp b/aten/src/TH/THLogAdd.cpp new file mode 100644 index 0000000..4b14f85 --- /dev/null +++ b/aten/src/TH/THLogAdd.cpp @@ -0,0 +1,88 @@ +#include "THLogAdd.h" + +#include + +#ifdef USE_DOUBLE +#define MINUS_LOG_THRESHOLD -39.14 +#else +#define MINUS_LOG_THRESHOLD -18.42 +#endif + +const double THLog2Pi=1.83787706640934548355; +const double THLogZero=-DBL_MAX; +const double THLogOne=0; + +double THLogAdd(double log_a, double log_b) +{ + double minusdif; + + if (log_a < log_b) + { + double tmp = log_a; + log_a = log_b; + log_b = tmp; + } + + minusdif = log_b - log_a; +#ifdef DEBUG + if (isnan(minusdif)) + THError("THLogAdd: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); +#endif + if (minusdif < MINUS_LOG_THRESHOLD) + return log_a; + else + return log_a + log1p(exp(minusdif)); +} + +double THLogSub(double log_a, double log_b) +{ + double minusdif; + + if (log_a < log_b) + THError("LogSub: log_a (%f) should be greater than log_b (%f)", log_a, log_b); + + minusdif = log_b - log_a; +#ifdef DEBUG + if (isnan(minusdif)) + THError("LogSub: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); +#endif + if (log_a == log_b) + return THLogZero; + else if (minusdif < MINUS_LOG_THRESHOLD) + return log_a; + else + return log_a + log1p(-exp(minusdif)); +} + +/* Credits to Leon Bottou */ +double THExpMinusApprox(const double x) +{ +#define EXACT_EXPONENTIAL 0 +#if EXACT_EXPONENTIAL + return exp(-x); +#else + /* fast approximation of exp(-x) for x positive */ +# define A0 (1.0) +# define A1 (0.125) +# define A2 (0.0078125) +# define A3 (0.00032552083) +# define A4 (1.0172526e-5) + if (x < 13.0) + { +/* assert(x>=0); */ + double y; + y = A0+x*(A1+x*(A2+x*(A3+x*A4))); + y *= y; + y *= y; + y *= y; + y = 1/y; + return y; + } + return 0; +# undef A0 +# undef A1 +# undef A2 +# undef A3 +# undef A4 +#endif +} diff --git a/aten/src/TH/THLogAdd.h b/aten/src/TH/THLogAdd.h new file mode 100644 index 0000000..9319b8f --- /dev/null +++ b/aten/src/TH/THLogAdd.h @@ -0,0 +1,14 @@ +#ifndef TH_LOG_ADD_INC +#define TH_LOG_ADD_INC + +#include "THGeneral.h" + +TH_API const double THLog2Pi; +TH_API const double THLogZero; +TH_API const double THLogOne; + +TH_API double THLogAdd(double log_a, double log_b); +TH_API double THLogSub(double log_a, double log_b); +TH_API double THExpMinusApprox(const double x); + +#endif diff --git a/aten/src/TH/THMath.h b/aten/src/TH/THMath.h new file mode 100644 index 0000000..638c98a --- /dev/null +++ b/aten/src/TH/THMath.h @@ -0,0 +1,287 @@ +#ifndef _THMATH_H +#define _THMATH_H +#include +#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#endif +#include + +#ifndef M_PIf +#define M_PIf 3.1415926535f +#endif // M_PIf + +static inline double TH_sigmoid(double value) { + return 1.0 / (1.0 + exp(-value)); +} + +static inline double TH_frac(double x) { + return x - trunc(x); +} + +static inline double 
TH_rsqrt(double x) { + return 1.0 / sqrt(x); +} + +static inline double TH_lerp(double a, double b, double weight) { + return a + weight * (b-a); +} + +static inline float TH_sigmoidf(float value) { + return 1.0f / (1.0f + expf(-value)); +} + +static inline float TH_fracf(float x) { + return x - truncf(x); +} + +static inline float TH_rsqrtf(float x) { + return 1.0f / sqrtf(x); +} + +static inline float TH_lerpf(float a, float b, float weight) { + return a + weight * (b-a); +} + +/* The next function is taken from https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c. +Below is the copyright. +Output was modified to be inf or -inf when input is 1 or -1. */ + + +/* + Copyright (c) 2014 Indiana University + All rights reserved. + + Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci., + Indiana University, Bloomington, IN + + This software is licensed under the New BSD license: + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided + that the following conditions are met: + + Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + Redistributions in binary form must reproduce the + above copyright notice, this list of conditions and + the following disclaimer in the documentation and/or + other materials provided with the distribution. + + Neither the name of Indiana University nor + the names of its contributors may be used to endorse + or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#define CENTRAL_RANGE 0.7 + +static inline double TH_erfinv(double y) { +/* Function to calculate inverse error function. Rational approximation +is used to generate an initial approximation, which is then improved to +full accuracy by two steps of Newton's method. Code is a direct +translation of the erfinv m file in matlab version 2.0. +Author: Gary L. 
Pavlis, Indiana University +Date: February 1996 +*/ + double x,z,num,dem; /*working variables */ + /* coefficients in rational expansion */ + double a[4]={ 0.886226899, -1.645349621, 0.914624893, -0.140543331}; + double b[4]={-2.118377725, 1.442710462, -0.329097515, 0.012229801}; + double c[4]={-1.970840454, -1.624906493, 3.429567803, 1.641345311}; + double d[2]={ 3.543889200, 1.637067800}; + if(fabs(y) > 1.0) return (atof("NaN")); /* This needs IEEE constant*/ + if(fabs(y) == 1.0) return((copysign(1.0,y))*atof("INFINITY")); + if(fabs(y) <= CENTRAL_RANGE){ + z = y*y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0])*z + 1.0); + x = y*num/dem; + } + else{ + z = sqrt(-log((1.0-fabs(y))/2.0)); + num = ((c[3]*z + c[2])*z + c[1])*z + c[0]; + dem = (d[1]*z + d[0])*z + 1.0; + x = (copysign(1.0,y))*num/dem; + } + /* Two steps of Newton-Raphson correction */ + x = x - (erf(x) - y)/( (2.0/sqrt(M_PI))*exp(-x*x)); + x = x - (erf(x) - y)/( (2.0/sqrt(M_PI))*exp(-x*x)); + + return(x); +} +#undef CENTRAL_RANGE + +static inline double TH_polevl(double x, double *A, size_t len) { + double result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + +static inline float TH_polevlf(float x, float *A, size_t len) { + float result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +static inline double TH_digamma(double x) { + static double PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + + int x_is_integer = x == floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + return TH_digamma(1 - x) - M_PI / tan(M_PI * x); + } + + // Push x to be >= 10 + double result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + double y = 0; + if (x < 1.0e17) { + double z = 1.0 / (x * x); + y = z * TH_polevl(z, A, 6); + } + return result + log(x) - (0.5 / x) - y; +} + +/* + * The following function comes with the following copyright notice. + * It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier + */ +static inline double TH_digammaf(float x) { + static float PSI_10 = 2.25175258906672110764f; + if (x == 0) { + return INFINITY; + } + + int x_is_integer = x == floorf(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // Avoid rounding errors for `tan`'s input. + // Those make a big difference at extreme values. 
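// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the comment
// above is about argument accuracy. Once |x| is large, pi*x rounded to float
// is off by a sizeable fraction of a radian, so tanf() on that argument is
// far less accurate than evaluating the whole reflection term in double and
// only rounding the final result, which is what the next line does. A
// standalone demo, with an arbitrarily chosen input:
#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = 3.14159265358979323846;
  float x = -100000.25f;                                // large, non-integer
  float float_only = (float)pi / tanf((float)pi * x);   // everything in float
  float via_double = (float)(pi / tan(pi * (double)x)); // as in the code below
  // Since x ends in .25 and tan has period pi, the exact value is -pi.
  printf("float only: %g   via double: %g\n", float_only, via_double);
  return 0;
}
// (end of editor's aside)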
+ float pi_over_tan_pi_x = (float)(M_PI / tan(M_PI * (double)x)); + return TH_digammaf(1 - x) - pi_over_tan_pi_x; + } + + // Push x to be >= 10 + float result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static float A[] = { + 8.33333333333333333333E-2f, + -2.10927960927960927961E-2f, + 7.57575757575757575758E-3f, + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f, + }; + + float y = 0; + if (x < 1.0e17) { + float z = 1 / (x * x); + y = z * TH_polevlf(z, A, 6); + } + return result + logf(x) - (0.5 / x) - y; +} + +static inline double TH_trigamma(double x) { + double sign = +1; + double result = 0; + if (x < 0.5) { + sign = -1; + const double sin_pi_x = sin(M_PI * x); + result -= (M_PI * M_PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const double ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x; + return sign * result; +} + +static inline float TH_trigammaf(float x) { + float sign = +1; + float result = 0; + if (x < 0.5f) { + sign = -1; + const float sin_pi_x = sinf(M_PIf * x); + result -= (M_PIf * M_PIf) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const float ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + return sign * result; +} + +#endif // _THMATH_H diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp new file mode 100644 index 0000000..e13b02f --- /dev/null +++ b/aten/src/TH/THMemoryFile.cpp @@ -0,0 +1,689 @@ +#include "THMemoryFile.h" +#include "THStorage.hpp" +#include "THFilePrivate.h" +#include "THDiskFile.h" +#include "stdint.h" + +#ifndef _WIN32 +#include +#endif + +typedef struct THMemoryFile__ +{ + THFile file; + THCharStorage *storage; + ssize_t size; + ssize_t position; + int longSize; + +} THMemoryFile; + +static int THMemoryFile_isOpened(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + return (mfself->storage != NULL); +} + +static int8_t *THMemoryFile_strnextspace(int8_t *str_, int8_t *c_) +{ + int8_t c; + + while( (c = *str_) ) + { + if( (c != ' ') && (c != '\n') && (c != ':') && (c != ';') ) + break; + str_++; + } + + while( (c = *str_) ) + { + if( (c == ' ') || (c == '\n') || (c == ':') || (c == ';') ) + { + *c_ = c; + *str_ = '\0'; + return(str_); + } + str_++; + } + return NULL; +} + +static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) +{ + ssize_t missingSpace; + + if(size <= self->size) + return; + else + { + if(size < self->storage->size) /* note the "<" and not "<=" */ + { + self->size = size; + THCharStorage_data(self->storage)[self->size] = '\0'; + return; + } + } + + missingSpace = size-self->storage->size+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->size/2 > missingSpace ? 
+ self->storage->size + (self->storage->size/2) + : self->storage->size + missingSpace)); +} + +static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) +{ + *isReadable = 0; + *isWritable = 0; + if(strlen(mode) == 1) + { + if(*mode == 'r') + { + *isReadable = 1; + return 1; + } + else if(*mode == 'w') + { + *isWritable = 1; + return 1; + } + } + else if(strlen(mode) == 2) + { + if(mode[0] == 'r' && mode[1] == 'w') + { + *isReadable = 1; + *isWritable = 1; + return 1; + } + } + return 0; +} + +/********************************************************/ + +#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM, INSIDE_SPACING) \ + static ssize_t THMemoryFile_read##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THMemoryFile *mfself = (THMemoryFile*)self; \ + ssize_t nread = 0; \ + \ + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); \ + \ + if (n == 0) \ + return 0; \ + \ + if(mfself->file.isBinary) \ + { \ + ssize_t nByte = sizeof(TYPE)*n; \ + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); \ + nread = nByteRemaining/sizeof(TYPE); \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nread*sizeof(TYPE)); \ + mfself->position += nread*sizeof(TYPE); \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ssize_t nByteRead = 0; \ + int8_t spaceChar = 0; \ + int8_t *spacePtr = THMemoryFile_strnextspace(THCharStorage_data(mfself->storage)+mfself->position, &spaceChar); \ + ASCII_READ_ELEM; \ + if(ret == EOF) \ + { \ + while(THCharStorage_data(mfself->storage)[mfself->position]) \ + mfself->position++; \ + } \ + else \ + mfself->position += nByteRead; \ + if(spacePtr) \ + *spacePtr = spaceChar; \ + } \ + if(mfself->file.isAutoSpacing && (n > 0)) \ + { \ + if( (mfself->position < mfself->size) && (THCharStorage_data(mfself->storage)[mfself->position] == '\n') ) \ + mfself->position++; \ + } \ + } \ + \ + if(nread != n) \ + { \ + mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ \ + if(!mfself->file.isQuiet) \ + THError("read error: read %d blocks instead of %d", nread, n); \ + } \ + \ + return nread; \ + } \ + \ + static ssize_t THMemoryFile_write##TYPEC(THFile *self, TYPE *data, ssize_t n) \ + { \ + THMemoryFile *mfself = (THMemoryFile*)self; \ + \ + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); \ + \ + if (n == 0) \ + return 0; \ + \ + if(mfself->file.isBinary) \ + { \ + ssize_t nByte = sizeof(TYPE)*n; \ + THMemoryFile_grow(mfself, mfself->position+nByte); \ + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByte); \ + mfself->position += nByte; \ + if(mfself->position > mfself->size) \ + { \ + mfself->size = mfself->position; \ + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; \ + } \ + } \ + else \ + { \ + ssize_t i; \ + for(i = 0; i < n; i++) \ + { \ + ssize_t nByteWritten; \ + while (1) \ + { \ + ASCII_WRITE_ELEM; \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) \ + { \ + mfself->position += nByteWritten; \ + break; \ + } \ + THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); \ + } \ + if(mfself->file.isAutoSpacing) \ + { \ + if(i < n-1) \ + { \ + THMemoryFile_grow(mfself, mfself->position+1); \ + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, " "); \ + mfself->position++; \ + } \ + if(i == n-1) \ + { \ + THMemoryFile_grow(mfself, mfself->position+1); \ + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, "\n"); \ + mfself->position++; \ + } \ + } \ + } \ + if(mfself->position > mfself->size) \ + { \ + mfself->size = mfself->position; \ + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; \ + } \ + } \ + \ + return n; \ + } + + +void THMemoryFile_longSize(THFile *self, int size) +{ + THMemoryFile *dfself = (THMemoryFile*)(self); + THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); + dfself->longSize = size; +} + +THCharStorage *THMemoryFile_storage(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + + THCharStorage_resize(mfself->storage, mfself->size+1); + + return mfself->storage; +} + +static void THMemoryFile_synchronize(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); +} + +static void THMemoryFile_seek(THFile *self, ssize_t position) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(position >= 0, 2, "position must be positive"); + + if(position <= mfself->size) + mfself->position = position; + else + { + mfself->file.hasError = 1; + if(!mfself->file.isQuiet) + THError("unable to seek at position %zu", position); + } +} + +static void THMemoryFile_seekEnd(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + + mfself->position = mfself->size; +} + +static ssize_t THMemoryFile_position(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + return mfself->position; +} + +static void THMemoryFile_close(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + 
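// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the ASCII
// write path in the READ_WRITE_METHODS macro above formats each element with
// snprintf into the CharStorage and, when snprintf reports that the output
// did not fit, grows the buffer by roughly half its size plus two and tries
// again. The same pattern with plain malloc/realloc, using hypothetical
// names (append_double is not a TH function):
#include <stdio.h>
#include <stdlib.h>

// Appends the formatted value at `pos`, growing the buffer until it fits.
static size_t append_double(char **buf, size_t *cap, size_t pos, double value) {
  for (;;) {
    size_t avail = *cap - pos;
    int written = snprintf(*buf + pos, avail, "%.17g", value);
    if (written >= 0 && (size_t)written < avail)
      return pos + (size_t)written;     // it fit: advance the write position
    *cap += *cap / 2 + 2;               // same growth policy as the macro
    *buf = (char *)realloc(*buf, *cap); // (a real program would check NULL)
  }
}

int main(void) {
  size_t cap = 4, pos = 0;
  char *buf = (char *)malloc(cap);
  pos = append_double(&buf, &cap, pos, 3.141592653589793);
  printf("\"%s\" written, capacity grew to %zu\n", buf, cap);
  free(buf);
  return 0;
}
// (end of editor's aside)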
THCharStorage_free(mfself->storage); + mfself->storage = NULL; +} + +static void THMemoryFile_free(THFile *self) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + if(mfself->storage) + THCharStorage_free(mfself->storage); + + THFree(mfself); +} + +/* READ_WRITE_METHODS(bool, Bool, */ +/* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%d", value), */ +/* 1) */ + +READ_WRITE_METHODS(uint8_t, Byte, + ssize_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ + if(spacePtr) *spacePtr = spaceChar; \ + nByteRead = ret; \ + nread = ret; \ + i = n-1; \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), + nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \ + i = n-1; \ + if(nByteWritten > -1) + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), + 0) + +/* DEBUG: we should check if %n is count or not as a element (so ret might need to be ret-- on some systems) */ +/* Note that we do a trick for char */ +READ_WRITE_METHODS(int8_t, Char, + ssize_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ + if(spacePtr) *spacePtr = spaceChar; \ + nByteRead = ret; \ + nread = ret; \ + i = n-1; \ + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), + nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \ + i = n-1; \ + if(nByteWritten > -1) + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), + 0) + +READ_WRITE_METHODS(int16_t, Short, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%hd", data[i]), + 1) + +READ_WRITE_METHODS(int32_t, Int, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]), + 1) + +READ_WRITE_METHODS(float, Float, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]), + 1) + +READ_WRITE_METHODS(THHalf, Half, + int nByteRead_; float buf; \ + int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ + data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])), + 1) + +READ_WRITE_METHODS(double, Double, + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; 
else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]), + 1) + +static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + ssize_t nread = 0L; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); + + if (n == 0) + return 0; + + if(mfself->file.isBinary) + { + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) + { + ssize_t nByte = sizeof(int64_t)*n; + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + nread = nByteRemaining/sizeof(int64_t); + memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nread*sizeof(int64_t)); + mfself->position += nread*sizeof(int64_t); + } else if(mfself->longSize == 4) + { + ssize_t nByte = 4*n; + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + nread = nByteRemaining/4; + ssize_t i; + for(i = 0; i < nread; i++) + data[i] = storage[i]; + mfself->position += nread*4; + } + else /* if(mfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + ssize_t nByte = 8*n; + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); + nread = nByteRemaining/8; + ssize_t i; + for(i = 0; i < nread; i++) + data[i] = storage[2*i + big_endian]; + mfself->position += nread*8; + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + ssize_t nByteRead = 0; + int8_t spaceChar = 0; + int8_t *spacePtr = THMemoryFile_strnextspace(THCharStorage_data(mfself->storage)+mfself->position, &spaceChar); + int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%" PRId64 "%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++; + if(ret == EOF) + { + while(THCharStorage_data(mfself->storage)[mfself->position]) + mfself->position++; + } + else + mfself->position += nByteRead; + if(spacePtr) + *spacePtr = spaceChar; + } + if(mfself->file.isAutoSpacing && (n > 0)) + { + if( (mfself->position < mfself->size) && (THCharStorage_data(mfself->storage)[mfself->position] == '\n') ) + mfself->position++; + } + } + + if(nread != n) + { + mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ + if(!mfself->file.isQuiet) + THError("read error: read %d blocks instead of %d", nread, n); + } + + return nread; +} + +static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); + + if (n == 0) + return 0; + + if(mfself->file.isBinary) + { + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) + { + ssize_t nByte = sizeof(int64_t)*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByte); + mfself->position += nByte; + } else if(mfself->longSize == 4) + { + ssize_t nByte = 4*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t i; + for(i = 0; i < n; i++) + storage[i] = (int32_t) data[i]; + mfself->position += nByte; + } + else /* if(mfself->longSize == 8) */ + { + int big_endian = !THDiskFile_isLittleEndianCPU(); + ssize_t nByte = 8*n; + THMemoryFile_grow(mfself, mfself->position+nByte); + int32_t *storage = (int32_t *)(THCharStorage_data(mfself->storage) + mfself->position); + ssize_t i; + for(i = 0; i < n; i++) + { + storage[2*i + !big_endian] = 0; + storage[2*i + big_endian] = (int32_t) data[i]; + } + mfself->position += nByte; + } + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + } + else + { + ssize_t i; + for(i = 0; i < n; i++) + { + ssize_t nByteWritten; + while (1) + { + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) + { + mfself->position += nByteWritten; + break; + } + THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); + } + if(mfself->file.isAutoSpacing) + { + if(i < n-1) + { + THMemoryFile_grow(mfself, mfself->position+1); + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, " "); + mfself->position++; + } + if(i == n-1) + { + THMemoryFile_grow(mfself, mfself->position+1); + sprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, "\n"); + mfself->position++; + } + } + } + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + } + + return n; +} + +static int8_t* THMemoryFile_cloneString(const int8_t *str, ssize_t size) +{ + int8_t *cstr = static_cast(THAlloc(size)); + memcpy(cstr, str, size); + return cstr; +} + +static ssize_t THMemoryFile_readString(THFile *self, const char *format, char **str_) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); + THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); + + if(mfself->position == mfself->size) /* eof ? 
*/ + { + mfself->file.hasError = 1; + if(!mfself->file.isQuiet) + THError("read error: read 0 blocks instead of 1"); + + *str_ = NULL; + return 0; + } + + if(format[1] == 'a') + { + ssize_t str_size = mfself->size-mfself->position; + + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, str_size); + mfself->position = mfself->size; + + return str_size; + } + else + { + int8_t *p = THCharStorage_data(mfself->storage)+mfself->position; + int eolFound = 0; + ssize_t posEol; + ssize_t i; + for(i = 0; i < mfself->size-mfself->position; i++) + { + if(p[i] == '\n') + { + posEol = i; + eolFound = 1; + break; + } + } + + if(eolFound) + { + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, posEol); + mfself->position += posEol+1; + return posEol; + } + else /* well, we read all! */ + { + ssize_t str_size = mfself->size-mfself->position; + + *str_ = (char*) THMemoryFile_cloneString(THCharStorage_data(mfself->storage)+mfself->position, str_size); + mfself->position = mfself->size; + + return str_size; + } + } + + *str_ = NULL; + return 0; +} + +static ssize_t THMemoryFile_writeString(THFile *self, const char *str, ssize_t size) +{ + THMemoryFile *mfself = (THMemoryFile*)self; + + THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); + THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); + + THMemoryFile_grow(mfself, mfself->position+size); + memmove(THCharStorage_data(mfself->storage)+mfself->position, str, size); + mfself->position += size; + if(mfself->position > mfself->size) + { + mfself->size = mfself->position; + THCharStorage_data(mfself->storage)[mfself->size] = '\0'; + } + + return size; +} + +THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) +{ + static struct THFileVTable vtable = { + THMemoryFile_isOpened, + + THMemoryFile_readByte, + THMemoryFile_readChar, + THMemoryFile_readShort, + THMemoryFile_readInt, + THMemoryFile_readLong, + THMemoryFile_readFloat, + THMemoryFile_readDouble, + THMemoryFile_readHalf, + THMemoryFile_readString, + + THMemoryFile_writeByte, + THMemoryFile_writeChar, + THMemoryFile_writeShort, + THMemoryFile_writeInt, + THMemoryFile_writeLong, + THMemoryFile_writeFloat, + THMemoryFile_writeDouble, + THMemoryFile_writeHalf, + THMemoryFile_writeString, + + THMemoryFile_synchronize, + THMemoryFile_seek, + THMemoryFile_seekEnd, + THMemoryFile_position, + THMemoryFile_close, + THMemoryFile_free + }; + + THMemoryFile *mfself; + int isReadable; + int isWritable; + + if(storage) + { + THArgCheck(THCharStorage_data(storage)[storage->size-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + THCharStorage_retain(storage); + } + else + { + THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); + storage = THCharStorage_newWithSize(1); + THCharStorage_data(storage)[0] = '\0'; + } + + mfself = static_cast(THAlloc(sizeof(THMemoryFile))); + + mfself->storage = storage; + mfself->size = (storage ? 
storage->size-1 : 0); + mfself->position = 0; + mfself->longSize = 0; + + mfself->file.vtable = &vtable; + mfself->file.isQuiet = 0; + mfself->file.isReadable = isReadable; + mfself->file.isWritable = isWritable; + mfself->file.isBinary = 0; + mfself->file.isAutoSpacing = 1; + mfself->file.hasError = 0; + + return (THFile*)mfself; +} + +THFile *THMemoryFile_new(const char *mode) +{ + return THMemoryFile_newWithStorage(NULL, mode); +} diff --git a/aten/src/TH/THMemoryFile.h b/aten/src/TH/THMemoryFile.h new file mode 100644 index 0000000..b54cdcc --- /dev/null +++ b/aten/src/TH/THMemoryFile.h @@ -0,0 +1,13 @@ +#ifndef TH_MEMORY_FILE_INC +#define TH_MEMORY_FILE_INC + +#include "THFile.h" +#include "THStorage.h" + +TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode); +TH_API THFile *THMemoryFile_new(const char *mode); + +TH_API THCharStorage *THMemoryFile_storage(THFile *self); +TH_API void THMemoryFile_longSize(THFile *self, int size); + +#endif diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp new file mode 100644 index 0000000..8755f77 --- /dev/null +++ b/aten/src/TH/THRandom.cpp @@ -0,0 +1,316 @@ +#include "THGeneral.h" +#include "THRandom.h" +#include "THGenerator.hpp" + +#ifndef _WIN32 +#include +#include +#endif + +/* Code for the Mersenne Twister random generator.... */ +#define n _MERSENNE_STATE_N +#define m _MERSENNE_STATE_M + +/* Creates (unseeded) new generator*/ +static THGenerator* THGenerator_newUnseeded() +{ + THGenerator *self = (THGenerator *)THAlloc(sizeof(THGenerator)); + memset(self, 0, sizeof(THGenerator)); + self->gen_state.left = 1; + self->gen_state.seeded = 0; + self->gen_state.normal_is_valid = 0; + new (&self->mutex) std::mutex(); + return self; +} + +/* Creates new generator and makes sure it is seeded*/ +THGenerator* THGenerator_new() +{ + THGenerator *self = THGenerator_newUnseeded(); + THRandom_seed(self); + return self; +} + +THGenerator* THGenerator_copy(THGenerator *self, THGenerator *from) +{ + THGeneratorState_copy(&self->gen_state, &from->gen_state); + return self; +} + +void THGenerator_free(THGenerator *self) +{ + self->mutex.~mutex(); + THFree(self); +} + +int THGeneratorState_isValid(THGeneratorState *_gen_state) +{ + if ((_gen_state->seeded == 1) && + (_gen_state->left > 0 && _gen_state->left <= n) && (_gen_state->next <= n)) + return 1; + + return 0; +} + +THGeneratorState* THGeneratorState_copy(THGeneratorState *self, THGeneratorState *from) +{ + memcpy(self, from, sizeof(THGeneratorState)); + return self; +} + +#ifndef _WIN32 +static uint64_t readURandomLong() +{ + int randDev = open("/dev/urandom", O_RDONLY); + uint64_t randValue; + if (randDev < 0) { + THError("Unable to open /dev/urandom"); + } + ssize_t readBytes = read(randDev, &randValue, sizeof(randValue)); + if (readBytes < (ssize_t) sizeof(randValue)) { + THError("Unable to read from /dev/urandom"); + } + close(randDev); + return randValue; +} +#endif // _WIN32 + +uint64_t THRandom_seed(THGenerator *_generator) +{ +#ifdef _WIN32 + uint64_t s = (uint64_t)time(0); +#else + uint64_t s = readURandomLong(); +#endif + THRandom_manualSeed(_generator, s); + return s; +} + +/* The next 4 methods are taken from http:www.math.keio.ac.jpmatumotoemt.html + Here is the copyright: + Some minor modifications have been made to adapt to "my" C... */ + +/* + A C-program for MT19937, with initialization improved 2002/2/10. + Coded by Takuji Nishimura and Makoto Matsumoto. 
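// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): minimal usage
// of the generator API introduced by this file and THRandom.h. A fresh
// generator is seeded from /dev/urandom (or time() on Windows) as above;
// reseeding it manually makes the draws reproducible. Assumes TH is built and
// its headers are on the include path.
#include <stdio.h>
#include "THRandom.h"

int main(void) {
  THGenerator *gen = THGenerator_new();   // already seeded from /dev/urandom
  THRandom_manualSeed(gen, 42);           // reseed: draws are now reproducible
  for (int i = 0; i < 3; i++)
    printf("%f\n", THRandom_uniform(gen, 0.0, 1.0));
  printf("initial seed: %llu\n", (unsigned long long)THRandom_initialSeed(gen));
  THGenerator_free(gen);
  return 0;
}
// (end of editor's aside)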
+ This is a faster version by taking Shawn Cokus's optimization, + Matthe Bellew's simplification, Isaku Wada's double version. + + Before using, initialize the state by using init_genrand(seed) + or init_by_array(init_key, key_length). + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Any feedback is very welcome. + http://www.math.keio.ac.jp/matumoto/emt.html + email: matumoto@math.keio.ac.jp +*/ + +/* Macros for the Mersenne Twister random generator... */ +/* Period parameters */ +/* #define n 624 */ +/* #define m 397 */ +#define MATRIX_A 0x9908b0dfUL /* constant vector a */ +#define UMASK 0x80000000UL /* most significant w-r bits */ +#define LMASK 0x7fffffffUL /* least significant r bits */ +#define MIXBITS(u,v) ( ((u) & UMASK) | ((v) & LMASK) ) +#define TWIST(u,v) ((MIXBITS(u,v) >> 1) ^ ((v)&1UL ? MATRIX_A : 0UL)) +/*********************************************************** That's it. */ + +void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_) +{ + int j; + + /* This ensures reseeding resets all of the state (i.e. state for Gaussian numbers) */ + THGenerator *blank = THGenerator_newUnseeded(); + THGenerator_copy(_generator, blank); + THGenerator_free(blank); + + _generator->gen_state.the_initial_seed = the_seed_; + _generator->gen_state.state[0] = _generator->gen_state.the_initial_seed & 0xffffffffUL; + for(j = 1; j < n; j++) + { + _generator->gen_state.state[j] = (1812433253UL * (_generator->gen_state.state[j-1] ^ (_generator->gen_state.state[j-1] >> 30)) + j); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, mSBs of the seed affect */ + /* only mSBs of the array state[]. 
*/ + /* 2002/01/09 modified by makoto matsumoto */ + _generator->gen_state.state[j] &= 0xffffffffUL; /* for >32 bit machines */ + } + _generator->gen_state.left = 1; + _generator->gen_state.seeded = 1; +} + +uint64_t THRandom_initialSeed(THGenerator *_generator) +{ + return _generator->gen_state.the_initial_seed; +} + +void THRandom_nextState(THGenerator *_generator) +{ + uint64_t *p = _generator->gen_state.state; + int j; + + _generator->gen_state.left = n; + _generator->gen_state.next = 0; + + for(j = n-m+1; --j; p++) + *p = p[m] ^ TWIST(p[0], p[1]); + + for(j = m; --j; p++) + *p = p[m-n] ^ TWIST(p[0], p[1]); + + *p = p[m-n] ^ TWIST(p[0], _generator->gen_state.state[0]); +} + +// TODO: this only returns 32-bits of randomness but as a uint64_t. This is +// weird and should be fixed. We should also fix the state to be uint32_t +// instead of uint64_t. (Or switch to a 64-bit random number generator). +uint64_t THRandom_random(THGenerator *_generator) +{ + uint64_t y; + + if (--(_generator->gen_state.left) == 0) + THRandom_nextState(_generator); + y = *(_generator->gen_state.state + (_generator->gen_state.next)++); + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 15) & 0xefc60000UL; + y ^= (y >> 18); + + return y; +} + +uint64_t THRandom_random64(THGenerator *_generator) +{ + uint64_t hi = THRandom_random(_generator); + uint64_t lo = THRandom_random(_generator); + return (hi << 32) | lo; +} + +// doubles have 52 bits of mantissa (fractional part) +static uint64_t DOUBLE_MASK = (1ULL << 53) - 1; +static double DOUBLE_DIVISOR = 1.0 / (1ULL << 53); + +// floats have 23 bits of mantissa (fractional part) +static uint32_t FLOAT_MASK = (1 << 24) - 1; +static float FLOAT_DIVISOR = 1.0f / (1 << 24); + +/* generates a random number on [0,1)-double-interval */ +static double uniform_double(THGenerator *_generator) +{ + uint64_t x = THRandom_random64(_generator); + return (x & DOUBLE_MASK) * DOUBLE_DIVISOR; +} + +/* generates a random number on [0,1)-double-interval */ +static float uniform_float(THGenerator *_generator) +{ + uint32_t x = (uint32_t)THRandom_random(_generator); + return (x & FLOAT_MASK) * FLOAT_DIVISOR; +} + +/********************************************************* + + Thanks *a lot* Takuji Nishimura and Makoto Matsumoto! + + Now my own code... + +*********************************************************/ + +double THRandom_standard_uniform(THGenerator *_generator) +{ + return uniform_double(_generator); +} + +double THRandom_uniform(THGenerator *_generator, double a, double b) +{ + return(uniform_double(_generator) * (b - a) + a); +} + +float THRandom_uniformFloat(THGenerator *_generator, float a, float b) +{ + return(uniform_float(_generator) * (b - a) + a); +} + +double THRandom_normal(THGenerator *_generator, double mean, double stdv) +{ + THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); + + /* This is known as the Box-Muller method */ + if(!_generator->gen_state.normal_is_valid) + { + _generator->gen_state.normal_x = uniform_double(_generator); + _generator->gen_state.normal_y = uniform_double(_generator); + _generator->gen_state.normal_rho = sqrt(-2. 
* log(1.0-_generator->gen_state.normal_y)); + _generator->gen_state.normal_is_valid = 1; + } + else + _generator->gen_state.normal_is_valid = 0; + + if(_generator->gen_state.normal_is_valid) + return _generator->gen_state.normal_rho*cos(2.*M_PI*_generator->gen_state.normal_x)*stdv+mean; + else + return _generator->gen_state.normal_rho*sin(2.*M_PI*_generator->gen_state.normal_x)*stdv+mean; +} + +double THRandom_exponential(THGenerator *_generator, double lambda) +{ + return(-1. / lambda * log(1-uniform_double(_generator))); +} + +double THRandom_cauchy(THGenerator *_generator, double median, double sigma) +{ + return(median + sigma * tan(M_PI*(uniform_double(_generator)-0.5))); +} + +/* Faut etre malade pour utiliser ca. + M'enfin. */ +double THRandom_logNormal(THGenerator *_generator, double mean, double stdv) +{ + THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); + return(exp(THRandom_normal(_generator, mean, stdv))); +} + +int THRandom_geometric(THGenerator *_generator, double p) +{ + THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1"); + return((int)(log(1-uniform_double(_generator)) / log(p)) + 1); +} + +int THRandom_bernoulli(THGenerator *_generator, double p) +{ + THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1"); + return(uniform_double(_generator) <= p); +} diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h new file mode 100644 index 0000000..5460d33 --- /dev/null +++ b/aten/src/TH/THRandom.h @@ -0,0 +1,83 @@ +#ifndef TH_RANDOM_INC +#define TH_RANDOM_INC + +#include "THGeneral.h" + +#define _MERSENNE_STATE_N 624 +#define _MERSENNE_STATE_M 397 + +/* Struct definition is moved to THGenerator.hpp, because THRandom.h +needs to be C-compatible in order to be included in C FFI extensions. */ +typedef struct THGenerator THGenerator; +typedef struct THGeneratorState THGeneratorState; + +#define torch_Generator "torch.Generator" + +/* Manipulate THGenerator objects */ +TH_API THGenerator * THGenerator_new(void); +TH_API THGenerator * THGenerator_copy(THGenerator *self, THGenerator *from); +TH_API void THGenerator_free(THGenerator *gen); + +/* Checks if given generator state is valid */ +TH_API int THGeneratorState_isValid(THGeneratorState *_gen_state); + +/* Manipulate THGeneratorState objects */ +TH_API THGeneratorState * THGeneratorState_copy(THGeneratorState *self, THGeneratorState *from); + +/* Initializes the random number generator from /dev/urandom (or on Windows +platforms with the current time (granularity: seconds)) and returns the seed. */ +TH_API uint64_t THRandom_seed(THGenerator *_generator); + +/* Initializes the random number generator with the given int64_t "the_seed_". */ +TH_API void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_); + +/* Returns the starting seed used. */ +TH_API uint64_t THRandom_initialSeed(THGenerator *_generator); + +/* Generates a uniform 32 bits integer. */ +TH_API uint64_t THRandom_random(THGenerator *_generator); + +/* Generates a uniform 64 bits integer. */ +TH_API uint64_t THRandom_random64(THGenerator *_generator); + +/* Generates a uniform random double on [0,1). */ +TH_API double THRandom_standard_uniform(THGenerator *_generator); + +/* Generates a uniform random double on [a, b). */ +TH_API double THRandom_uniform(THGenerator *_generator, double a, double b); + +/* Generates a uniform random float on [0,1). */ +TH_API float THRandom_uniformFloat(THGenerator *_generator, float a, float b); + +/** Generates a random number from a normal distribution. 
+ (With mean #mean# and standard deviation #stdv >= 0#). +*/ +TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv); + +/** Generates a random number from an exponential distribution. + The density is $p(x) = lambda * exp(-lambda * x)$, where + lambda is a positive number. +*/ +TH_API double THRandom_exponential(THGenerator *_generator, double lambda); + +/** Returns a random number from a Cauchy distribution. + The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$ +*/ +TH_API double THRandom_cauchy(THGenerator *_generator, double median, double sigma); + +/** Generates a random number from a log-normal distribution. + (#mean > 0# is the mean of the log-normal distribution + and #stdv# is its standard deviation). +*/ +TH_API double THRandom_logNormal(THGenerator *_generator, double mean, double stdv); + +/** Generates a random number from a geometric distribution. + It returns an integer #i#, where $p(i) = (1-p) * p^(i-1)$. + p must satisfy $0 < p < 1$. +*/ +TH_API int THRandom_geometric(THGenerator *_generator, double p); + +/* Returns true with probability $p$ and false with probability $1-p$ (p > 0). */ +TH_API int THRandom_bernoulli(THGenerator *_generator, double p); + +#endif diff --git a/aten/src/TH/THSize.cpp b/aten/src/TH/THSize.cpp new file mode 100644 index 0000000..2eb0039 --- /dev/null +++ b/aten/src/TH/THSize.cpp @@ -0,0 +1,26 @@ +#include "THSize.h" + +int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB) { + int d; + if (dimsA != dimsB) + return 0; + for(d = 0; d < dimsA; ++d) + { + if(sizeA[d] != sizeB[d]) + return 0; + } + return 1; +} + +ptrdiff_t THSize_nElement(int64_t dims, int64_t *size) { + if(dims == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < dims; d++) + nElement *= size[d]; + return nElement; + } +} diff --git a/aten/src/TH/THSize.h b/aten/src/TH/THSize.h new file mode 100644 index 0000000..2927f21 --- /dev/null +++ b/aten/src/TH/THSize.h @@ -0,0 +1,13 @@ +#ifndef TH_SIZE_INC +#define TH_SIZE_INC + +#include "THGeneral.h" +#include + +// THTensor functions that would work on a THSize if we had such a class in C++, +// i.e. THTensor functions that depend only on the shape of the tensor, not the type. 
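// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the two
// shape-only helpers declared below distinguish "same size" (same number of
// dimensions and the same extent in each) from merely having the same number
// of elements. Assumes the TH headers are on the include path.
#include <stdint.h>
#include <stdio.h>
#include "THSize.h"

int main(void) {
  int64_t a[3] = {2, 3, 4};
  int64_t b[2] = {6, 4};
  printf("same size: %d\n", THSize_isSameSizeAs(a, 3, b, 2)); // prints 0
  printf("elements: %td vs %td\n",
         THSize_nElement(3, a), THSize_nElement(2, b));       // 24 vs 24
  return 0;
}
// (end of editor's aside)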
+ +TH_API int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB); +TH_API ptrdiff_t THSize_nElement(int64_t dims, int64_t *size); + +#endif diff --git a/aten/src/TH/THStorage.cpp b/aten/src/TH/THStorage.cpp new file mode 100644 index 0000000..f4910c3 --- /dev/null +++ b/aten/src/TH/THStorage.cpp @@ -0,0 +1,228 @@ +#include + +#include "THStorage.hpp" + +#include "generic/THStorage.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.cpp" +#include "THGenerateHalfType.h" + +// Free a non-weak pointer to THStorage +void THStorage_free(THStorage *storage) { + if (!storage) { + return; + } + + if (storage->flag & TH_STORAGE_REFCOUNTED) { + if (--storage->refcount == 0) { + if (storage->finalizer) { + (*storage->finalizer)(); + } + storage->finalizer.~unique_ptr(); + storage->data_ptr.~DataPtr(); + THStorage_weakFree(storage); + } + } +} + +// Manually retains a weak reference +void THStorage_weakRetain(THStorage *weak_storage) { + weak_storage->weakcount++; +} + +// Releases a weak reference +void THStorage_weakFree(THStorage *weak_storage) { + if (--weak_storage->weakcount == 0) { + weak_storage->refcount.~atomic(); + weak_storage->weakcount.~atomic(); + THFree(weak_storage); + } +} + +// Given a weak reference, returns a strong reference to a storage (which must +// be freed when done) or null if the storage is already dead. +THStorage* THStorage_weakLock(THStorage *weak_storage) { + for (;;) { + int refcount = weak_storage->refcount.load(); + if (refcount == 0) return nullptr; + if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; + } + return weak_storage; +} + +THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { + return _THSizeDesc(THLongStorage_data(size), size->size); +} + +THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement) +{ + ptrdiff_t total_size = (size->size > 0 ? 
1 : 0); + ptrdiff_t dim_infer = -1; + ptrdiff_t i; + for (i = 0; i < size->size; i++) { + if (THLongStorage_data(size)[i] == -1) { + THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred"); + dim_infer = i; + } else { + total_size *= THLongStorage_data(size)[i]; + } + } + if (dim_infer != -1) { + THDescBuff buf = THLongStorage_sizeDesc(size); + THArgCheck(total_size > 0 && nElement % total_size == 0, 2, + "size '%s' is invalid for input with %td elements", buf.str, nElement); + } else { + THDescBuff buf = THLongStorage_sizeDesc(size); + THArgCheck(nElement == total_size, 2, + "size '%s' is invalid for input with %td elements", buf.str, nElement); + } + THLongStorage* copy = THLongStorage_newWithSize(size->size); + THLongStorage_copy(copy, size); + if (dim_infer != -1) { + THLongStorage_data(copy)[dim_infer] = nElement / total_size; + } + return copy; +} + +THStorage* THStorage_new(at::ScalarType scalar_type) +{ + return THStorage_newWithSize(scalar_type, 0); +} + +THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size) +{ + return THStorage_newWithAllocator(scalar_type, size, getTHDefaultAllocator()); +} + +THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, + at::Allocator *allocator) +{ + THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(allocator->allocate(at::elementSize(scalar_type)*size)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); // from the strong reference + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} + +ptrdiff_t THStorage_size(const THStorage *self) +{ + return self->size; +} + +size_t THStorage_elementSize(const THStorage *self) +{ + return at::elementSize(self->scalar_type); +} + +THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags) +{ + size_t actual_size = -1; + THStorage *storage = THStorage_newWithDataAndAllocator(scalar_type, + THMapAllocator::makeDataPtr( + filename, + flags, + size * at::elementSize(scalar_type), + &actual_size), + size, + /* allocator */ nullptr); + + if (size <= 0) { + storage->size = actual_size/THStorage_elementSize(storage); + } + + THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); + + return storage; +} + +void THStorage_setFlag(THStorage *storage, const char flag) +{ + storage->flag |= flag; +} + +void THStorage_clearFlag(THStorage *storage, const char flag) +{ + storage->flag &= ~flag; +} + +void THStorage_retain(THStorage *storage) +{ + if (storage && (storage->flag & TH_STORAGE_REFCOUNTED)) { + ++storage->refcount; + } +} + +/* +// I don't think you should ever call this +THStorage* THStorage_newWithData(at::ScalarType scalar_type, std::unique_ptr data, ptrdiff_t size) +{ + return THStorage_newWithDataAndAllocator(scalar_type, data, size, + getTHDefaultAllocator()); +} +*/ + +THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + THAllocator* allocator) { + THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(std::move(data)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); // from the strong reference + new (&storage->finalizer) 
std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} + +void THStorage_resize(THStorage *storage, ptrdiff_t size) +{ + if (storage->flag & TH_STORAGE_RESIZABLE) + { + /* case when the allocator does not have a realloc defined */ + at::DataPtr old_data; + std::swap(old_data, storage->data_ptr); + ptrdiff_t old_size = storage->size; + if (size != 0) { + storage->data_ptr = storage->allocator->allocate(at::elementSize(storage->scalar_type)*size); + } + storage->size = size; + if (old_data != nullptr) { + ptrdiff_t copy_size = old_size; + if (storage->size < copy_size) { + copy_size = storage->size; + } + if (copy_size > 0) { + memcpy(storage->data_ptr.get(), old_data.get(), at::elementSize(storage->scalar_type)*copy_size); + } + } + } else { + THError("Trying to resize storage that is not resizable"); + } +} + +void THStorage_swap(THStorage *storage1, THStorage *storage2) +{ +#define SWAP(val) { std::swap(storage1->val, storage2->val); } + SWAP(scalar_type); + SWAP(data_ptr); + SWAP(size); + // don't swap refcount! + SWAP(flag); + SWAP(allocator); + SWAP(finalizer); +#undef SWAP +} diff --git a/aten/src/TH/THStorage.h b/aten/src/TH/THStorage.h new file mode 100644 index 0000000..ce53827 --- /dev/null +++ b/aten/src/TH/THStorage.h @@ -0,0 +1,25 @@ +#pragma once + +#include "THGeneral.h" +#include "THAllocator.h" + +#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) + +#include "generic/THStorage.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.h" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateHalfType.h" + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +TH_API void THStorage_free(THStorage *storage); +TH_API void THStorage_weakFree(THStorage *storage); + +TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); +TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); diff --git a/aten/src/TH/THStorage.hpp b/aten/src/TH/THStorage.hpp new file mode 100644 index 0000000..d767ada --- /dev/null +++ b/aten/src/TH/THStorage.hpp @@ -0,0 +1,86 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THStorage.h" + +#include +#include +#include "THTypeConversion.hpp" +#include + +// Note [Weak references for intrusive refcounting] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Here's the scheme: +// +// - refcount == number of strong references to the object +// weakcount == number of weak references to the object, +// plus one more if refcount > 0 +// +// - THStorage stays live as long as there are any strong +// or weak pointers to it (weakcount > 0, since strong +// references count as a +1 to weakcount) +// +// - finalizers are called and data_ptr is deallocated when refcount == 0 +// +// - Once refcount == 0, it can never again be > 0 (the transition +// from > 0 to == 0 is monotonic) +// +// - When you access THStorage via a weak pointer, you must +// atomically increment the use count, if it is greater than 0. +// If it is not, you must report that the storage is dead. 
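// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): a toy version
// of the counting scheme described in this note, with hypothetical names.
// The interesting operation is promoting a weak reference to a strong one,
// which may only succeed while refcount > 0; the CAS loop below mirrors
// THStorage_weakLock in THStorage.cpp above.
#include <atomic>
#include <cstdio>

struct Toy {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references, plus 1 while refcount > 0
};

// Returns the object with its refcount bumped, or nullptr if already dead.
Toy* toy_weak_lock(Toy* t) {
  int rc = t->refcount.load();
  while (rc > 0) {
    // compare_exchange reloads rc on failure, so a concurrent release that
    // drops the count to zero can never be "resurrected" here.
    if (t->refcount.compare_exchange_weak(rc, rc + 1)) return t;
  }
  return nullptr;
}

int main() {
  Toy t;
  std::printf("alive: %s\n", toy_weak_lock(&t) ? "locked" : "dead");
  t.refcount = 0;  // simulate the last strong reference going away
  std::printf("after release: %s\n", toy_weak_lock(&t) ? "locked" : "dead");
}
// (end of editor's aside)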
+// + +struct THFinalizer { + virtual void operator()() = 0; + virtual ~THFinalizer() {}; +}; + +typedef struct THStorage +{ + at::ScalarType scalar_type; + at::DataPtr data_ptr; + ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; + char flag; + at::Allocator *allocator; + std::unique_ptr finalizer; + + template + inline T * data() const { + auto scalar_type_T = at::CTypeToScalarType>::to(); + if (scalar_type != scalar_type_T) { + AT_ERROR("Attempt to access Storage having data type ", at::toString(scalar_type), + " as data type ", at::toString(scalar_type_T)); + } + return unsafe_data(); + } + + template + inline T * unsafe_data() const { + return static_cast(this->data_ptr.get()); + } +} THStorage; + +TH_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size); +TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, + at::Allocator *allocator); + +TH_API ptrdiff_t THStorage_size(const THStorage *self); +TH_API size_t THStorage_elementSize(); +TH_API THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); +TH_API void THStorage_retain(THStorage *storage); +TH_API THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); +TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); + +TH_API void THStorage_weakRetain(THStorage *weak_storage); +TH_API void THStorage_weakFree(THStorage *weak_storage); +TH_API THStorage* THStorage_weakLock(THStorage *weak_storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp new file mode 100644 index 0000000..48ddcd2 --- /dev/null +++ b/aten/src/TH/THTensor.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include +#include "THTensor.hpp" +#include "THVector.h" +#include "generic/simd/simd.h" + +#include "THBlas.h" +#include "THLapack.h" +#include "THRandom.h" +#include "THTensorDimApply.h" +#include "THMath.h" + +#include "generic/THTensor.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensor.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THTensorCopy.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorCopy.cpp" +#include "THGenerateHalfType.h" + +#include "generic/THTensorRandom.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorMath.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorConv.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorLapack.cpp" +#include "THGenerateFloatTypes.h" + +#include + +void THTensor_free(THTensor *self) +{ + if(!self) + return; + + if(--self->refcount == 0) + { + delete self; + } +} + +// On a high level, +// 1. separate oldshape chunks of dimensions, where the dimensions are +// ``contiguous'' in each chunk, i.e., oldstride[i] = oldshape[i+1] * oldstride[i+1] +// 2. newshape must be able to be separated into same number of chunks as oldshape was separated into, +// where each chunk of newshape has matching ``numel'', i.e., number of subspaces, +// as the corresponding chunk of oldshape. 
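// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): the simplest
// special case of the chunking described above. A fully contiguous tensor is
// a single chunk, so the new strides are just the row-major strides of the
// new shape, computed right to left; THTensor_compute_stride below only has
// to do real work when the old strides are non-contiguous (e.g. after a
// transpose), and it returns nullopt when no copy-free view exists.
// contiguous_strides is a hypothetical helper, not part of TH.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> stride(shape.size());
  int64_t running = 1;                   // stride of the innermost dimension
  for (int64_t d = (int64_t)shape.size() - 1; d >= 0; --d) {
    stride[d] = running;
    running *= shape[d] ? shape[d] : 1;  // treat size-0 dims like size-1
  }
  return stride;
}

int main() {
  // Viewing a contiguous {2, 3, 4} tensor (strides {12, 4, 1}) as {6, 4}
  // always succeeds, with strides {4, 1}:
  for (int64_t s : contiguous_strides({6, 4})) std::printf("%lld ", (long long)s);
  std::printf("\n");
}
// (end of editor's aside)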
+at::optional> +THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape) { + if (oldshape.empty()) { + return std::vector(newshape.size(), 1); + } + + // NOTE: stride is arbitrary is somewhat arbitrary in the numel() == 0 case; + // to match NumPy behavior we copy the strides if the size matches, otherwise + // we use the stride as if it were computed via resize. + // This could perhaps be combined with the below code, but the complexity didn't seem worth it. + int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); + if (numel == 0 && oldshape.equals(newshape)) { + return std::vector(oldstride); + } + + std::vector newstride(newshape.size()); + if (numel == 0) { + int64_t view_numel = 1; + for (int64_t view_d = newshape.size() - 1; view_d >= 0; view_d--) { + if (view_d == newshape.size() - 1) { + newstride[view_d] = 1; + } else { + newstride[view_d] = std::max(newshape[view_d+1], 1) * newstride[view_d+1]; + } + } + return newstride; + } + + int64_t view_d = newshape.size() - 1; + // stride for each subspace in the chunk + int64_t chunk_base_stride = oldstride.back(); + // numel in current chunk + int64_t tensor_numel = 1; + int64_t view_numel = 1; + for (int64_t tensor_d = oldshape.size() - 1; tensor_d >= 0; tensor_d--) { + tensor_numel *= oldshape[tensor_d]; + // if end of tensor size chunk, check view + if ((tensor_d == 0) || + (oldshape[tensor_d - 1] != 1 && oldstride[tensor_d - 1] != tensor_numel * chunk_base_stride)) { + while (view_d >= 0 && (view_numel < tensor_numel || newshape[view_d] == 1)) { + newstride[view_d] = view_numel * chunk_base_stride; + view_numel *= newshape[view_d]; + view_d--; + } + if (view_numel != tensor_numel) { + return at::nullopt; + } + if (tensor_d > 0) { + chunk_base_stride = oldstride[tensor_d - 1]; + tensor_numel = 1; + view_numel = 1; + } + } + } + if (view_d != -1) { + return at::nullopt; + } + return newstride; +} diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h new file mode 100644 index 0000000..3984bf9 --- /dev/null +++ b/aten/src/TH/THTensor.h @@ -0,0 +1,38 @@ +#ifndef TH_TENSOR_INC +#define TH_TENSOR_INC + +#include "THStorage.h" +#include "THTensorApply.h" + +#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) + +/* basics */ +#include "generic/THTensor.h" +#include "THGenerateAllTypes.h" + +#include "generic/THTensor.h" +#include "THGenerateHalfType.h" + +#include "generic/THTensorCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THTensorCopy.h" +#include "THGenerateHalfType.h" + +/* random numbers */ +#include "THRandom.h" +#include "generic/THTensorRandom.h" +#include "THGenerateAllTypes.h" + +/* maths */ +#include "generic/THTensorMath.h" +#include "THGenerateAllTypes.h" + +/* convolutions */ +#include "generic/THTensorConv.h" +#include "THGenerateAllTypes.h" + +/* lapack support */ +#include "generic/THTensorLapack.h" +#include "THGenerateFloatTypes.h" +#endif diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp new file mode 100644 index 0000000..1d268a7 --- /dev/null +++ b/aten/src/TH/THTensor.hpp @@ -0,0 +1,87 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THTensor.h" +#include "THStorage.hpp" + +#include +#include + +struct THTensor +{ + THTensor(THStorage* storage) + : refcount(1) + , storage(storage) + , storageOffset(0) + // TODO: Naughty naughty! 
+ , size(static_cast(THAlloc(sizeof(int64_t)))) + , stride(static_cast(THAlloc(sizeof(int64_t)))) + , dim_(1) + { + size[0] = 0; + stride[0] = 1; + } + + ~THTensor() { + THFree(size); + THFree(stride); + if (storage) { + THStorage_free(storage); + } + } + + std::atomic refcount; + + // Note: storage->size may be greater than the recorded size + // of a tensor + THStorage *storage; + ptrdiff_t storageOffset; + + int64_t *size; + int64_t *stride; + int64_t dim_; + + template + inline T * data() const { + return storage->data() + storageOffset; + } + + template + inline T * unsafe_data() const { + return storage->unsafe_data() + storageOffset; + } + + // [NOTE: _dim() vs dim()] + // _dim() returns the "old" TH dimension view where no dimensions represents an empty tensor. + // dim() returns the ATen view of the dimensionality, i.e. 0-sized dimensions are supported. + inline int64_t _dim() const { + return is_empty() ? 0 : dim_; + } + + inline int64_t dim() const { + return dim_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + for (int64_t i = 0; i < dim_; ++i) { + if (size[i] == 0) { + return true; + } + } + return false; + } + + inline at::IntList sizes() { + return at::IntList(size, dim_); + } +}; + +#include "generic/THTensorFastGetSet.hpp" +#include "THGenerateAllTypes.h" + +TH_API void THTensor_free(THTensor *self); +TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, + at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h new file mode 100644 index 0000000..0b699e8 --- /dev/null +++ b/aten/src/TH/THTensorApply.h @@ -0,0 +1,526 @@ +#ifndef TH_TENSOR_APPLY_INC +#define TH_TENSOR_APPLY_INC + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where the + * data is no longer contiguous, i.e. the stride at that dimension is not equal to + * the size of the tensor defined by the outer dimensions. Let's call this outer + * (contiguous) tensor A. Note that if the Tensor is contiguous, then A is equal + * to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * index_B), + * i.e. basically we compute the offset into the storage as we would normally for a + * Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we + * can simply loop for sizeof(A) iterations and perform the operation, without having to + * follow the order described by the strides of A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in memory. For + * example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two + * dimensions can be merged for the purposes of APPLY, reducing the number of nested + * loops. 
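// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of this patch): step 3 of the
// strategy above, merging dimensions that are contiguous in memory, shown in
// isolation from the macro machinery. Two adjacent dimensions d and d+1 can
// be collapsed whenever stride[d] == stride[d+1] * size[d+1]. coalesce_dims
// is a hypothetical helper that rewrites size/stride in place.
#include <stdint.h>
#include <stdio.h>

static int64_t coalesce_dims(int64_t *size, int64_t *stride, int64_t ndim) {
  if (ndim == 0) return 0;
  int64_t out = 0;                 // index of the last dimension kept so far
  for (int64_t d = 1; d < ndim; d++) {
    if (stride[out] == stride[d] * size[d]) {
      size[out] *= size[d];        // contiguous: fold d into the kept dim
      stride[out] = stride[d];
    } else {
      ++out;                       // gap in memory: keep d as its own loop
      size[out] = size[d];
      stride[out] = stride[d];
    }
  }
  return out + 1;
}

int main(void) {
  // The 3x3x3x3 tensor narrowed from 3x3x4x3 mentioned above: only the two
  // outermost dimensions merge, so four nested loops collapse to two.
  int64_t size[4] = {3, 3, 3, 3}, stride[4] = {36, 12, 3, 1};
  int64_t nd = coalesce_dims(size, stride, 4);
  for (int64_t d = 0; d < nd; d++)   // prints: 9/12 then 9/1
    printf("size %lld stride %lld\n", (long long)size[d], (long long)stride[d]);
  return 0;
}
// (end of editor's aside)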
+ */ + +#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \ + TYPE *TENSOR##_data = NULL; \ + int64_t *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \ + int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ + TENSOR##_n = 1; \ + for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \ + TENSOR##_n *= TENSOR->size[TENSOR##_i]; \ +\ + if(TENSOR->is_empty()) \ + TH_TENSOR_APPLY_hasFinished = 1; \ + else \ + { \ + TENSOR##_data = TENSOR->storage->data()+TENSOR->storageOffset; \ + TENSOR##_size = 1; \ + TENSOR##_stride = 1; \ + for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ + if(TENSOR->size[TENSOR##_i] != 1) { \ + if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \ + TENSOR##_size *= TENSOR->size[TENSOR##_i]; \ + else{ \ + TENSOR##_contiguous = 0; \ + break; \ + } \ + } \ + } \ + if (!TENSOR##_contiguous) { \ + /* Find the dimension of contiguous sections */ \ + TENSOR##_dim = 1; \ + for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; TENSOR##_i--) \ + { \ + if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ + TENSOR##_dim++; \ + } \ + /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ + TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \ + TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \ + TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ + TH_TENSOR_dim_index = TENSOR##_dim-1; \ + TENSOR##_dimOffset = (DIM == TENSOR->_dim()-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->_dim()-1]; \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->_dim()-1]; \ + /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ + /* storage is given by storage_offset + (i * j), where i is the stride */ \ + /* vector and j is tensor_counter vector. This sets the starting position for the loop. 
*/ \ + for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \ + TENSOR##_counter[TENSOR##_i] = 0; \ + } \ + for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; --TENSOR##_i) { \ + if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \ + if (DIM != TENSOR->_dim()-1 && TENSOR##_i < DIM) \ + TENSOR##_dimOffset--; \ + } else { \ + --TH_TENSOR_dim_index; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ + } \ + } \ + /* Size of the inner most section */ \ + TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \ + /* Stride of the inner most section */ \ + TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \ + } \ + else{\ + TENSOR##_dim = 1;\ + TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*3);\ + TENSOR##_sizes = TENSOR##_counter + 1;\ + TENSOR##_strides = TENSOR##_counter + 2;\ + TENSOR##_sizes[0] = TENSOR##_n;\ + TENSOR##_strides[0] = 1;\ + TENSOR##_size = TENSOR##_sizes[0];\ + TENSOR##_stride = TENSOR##_strides[0];\ + }\ + } \ + TENSOR##_i = 0; + +#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \ + if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \ + { \ + if(TENSOR##_contiguous) \ + break; \ +\ + if(TENSOR##_dim == 1) \ + break; \ +\ + /* Reset pointer to beginning of loop */ \ + TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \ + for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \ + { \ + TENSOR##_counter[TENSOR##_i]++; \ + /* Jump ahread by the stride of this dimension */ \ + TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \ +\ + if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \ + { \ + if(TENSOR##_i == 0) \ + { \ + TH_TENSOR_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + /* Reset the pointer to the beginning of the chunk defined by this dimension */ \ + TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \ + TENSOR##_counter[TENSOR##_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + TENSOR##_i = 0; \ + } \ + +#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \ + \ + int elements_equal = 1; \ + if(TENSOR1##_n != TENSOR2##_n) { \ + elements_equal = 0; \ + } \ + else if(TENSOR1##_n != TENSOR3##_n) { \ + elements_equal = 0; \ + } \ + if (elements_equal == 0) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ + "number of elements, but got %d, %d and %d elements respectively", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \ + TENSOR1##_n, TENSOR2##_n, TENSOR3##_n); \ + } \ + \ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, 
TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \ + } \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter); \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ + if(TENSOR3##_counter != NULL) \ + THFree(TENSOR3##_counter); \ +} + +#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ + TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE) + +#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ +\ + if(TENSOR1##_n != TENSOR2##_n) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ + "number of elements, but got %d and %d elements respectively", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \ + } \ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ + } \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter); \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ +} + +#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ + TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE) + +#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \ +{ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \ +\ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + /* Loop through the inner most region of the Tensor */ \ + for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \ + } \ + THFree(TENSOR##_counter); \ +} + +#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \ + TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE) + + +#ifdef _OPENMP + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#include + +/* + * Calcuate the memory offset of an element in a tensor. The strategy is below: + * + * 1. convert the line index(the index of the element) to the indexs(coordinates) in the tensor. + * It can hinted by a classical problem: Getting each individual digit from a whole integer(Decimal base). + * A N-digit decimal base number could be view as a N-dimension tensor and the sizes of the tensor are 10. + * So the value the whole interger is the line index. And the digits could be viewed as the indexes in + * different dimentions. + * + * 2. convert the indexs(coordinates) in the tensor to the memory offset. + * + * You can get the detailes in the for-statement iterations. + * + * The macro is only used in the first element in each thread. 
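+ * (Worked example, following the loop below: with collapsed sizes {2, 3, 4} and
+ * line_index_start = 13, counter_tmp comes out as {1, 0, 1}, since 13 % 4 = 1,
+ * then 3 % 3 = 0, then 1 % 2 = 1, and the memory offset is
+ * 1*strides[0] + 0*strides[1] + 1*strides[2].)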
For the rest, the memory offset could update + * according to info of the tensor in order to get better performance. So we should also record the each + * indexs in coresponding dimension of first element. + * The recorded info is stored in the TENSOR##_counter_tmp. + * + */ +#define __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR) \ + int64_t *TENSOR##_counter_tmp = (int64_t*)THAlloc(sizeof(int64_t) * TENSOR##_dim); \ + ptrdiff_t TENSOR##_memory_offset = 0; \ + ptrdiff_t TENSOR##_quot = line_index_start; \ + for (TENSOR##_i = TENSOR##_dim-1; TENSOR##_i>=0; --TENSOR##_i) { \ + TENSOR##_counter_tmp[TENSOR##_i] = TENSOR##_quot%TENSOR##_sizes[TENSOR##_i]; \ + TENSOR##_quot /= TENSOR##_sizes[TENSOR##_i]; \ + TENSOR##_memory_offset += TENSOR##_counter_tmp[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ + } + +/* + * The macro update the indexes in each dimension of the elements except for the first one allocated in + * each thread. + * For a tensor, if the index of some dimension reaches the size of the corresponding dimension. It will carry and clear. + * If the index of next high dimension does do, the index of next high dimension should carry and clear, too. + * + * The momery offset calculatation is a little confusing. If current index carries, the current index is set to 0. So + * the offset should decrease by size*stride of the last dimension. Then the index next high dimension increases by 1. So + * the offset should increase by stride of next high dimension. + */ +#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR) \ + if(TENSOR##_i == TENSOR##_size && TENSOR##_dim > 1){ /*reaches the edge*/ \ + int TENSOR##_carry_coord = 1; /*set carry flag to true*/ \ + TENSOR##_start = 0; /*the current index be cleared to 0*/\ + TENSOR##_data -= TENSOR##_size * TENSOR##_stride; /*the momery offset reset to the first one in current dimension */\ + for(TENSOR##_i = TENSOR##_dim - 2; (TENSOR##_i >= 0) && (TENSOR##_carry_coord); TENSOR##_i--){ \ + TENSOR##_counter_tmp[TENSOR##_i]++; /*the index of next high dimension update*/ \ + TENSOR##_data += TENSOR##_strides[TENSOR##_i]; /*memory offset increase by stride of next high dimension*/\ + if(TENSOR##_counter_tmp[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]){ /*The next high dimension also carry, continue + to clear and carry*/\ + TENSOR##_data -= TENSOR##_sizes[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ + TENSOR##_counter_tmp[TENSOR##_i] = 0; \ + } else { \ + TENSOR##_carry_coord = 0; \ + } \ + } \ + } else { \ + TENSOR##_start = TENSOR##_i; \ + } + + +#define TH_TENSOR_APPLY_REDUCTION_OMP(TYPE, TENSOR, OPERATION, CODE, OMP_THRESHOLD) \ +{\ + int TENSOR##Contg = THTensor_(isContiguous)(TENSOR); \ + ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \ + if(TENSOR##Contg){ \ + ptrdiff_t iter = 0; \ + TYPE *rp = TENSOR->storage->data()+TENSOR->storageOffset; \ + PRAGMA( omp parallel for if (TENSOR##Size > OMP_THRESHOLD * 10) firstprivate(rp) reduction(OPERATION) ) \ + for (iter = 0; iter < TENSOR##Size; iter++) { \ + TYPE *TENSOR##_data = rp+iter; \ + CODE \ + } \ + } else { \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, -1, 1);\ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (TENSOR##Size > OMP_THRESHOLD) firstprivate(TENSOR##_data, TENSOR##_sizes, TENSOR##_strides, TENSOR##_dim, TENSOR##_stride, TENSOR##_size, TENSOR##_i) reduction(OPERATION))\ + {\ + size_t num_threads = omp_get_num_threads();\ + size_t tid = omp_get_thread_num();\ + size_t 
line_seg_length_avg = TENSOR##Size/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? (TENSOR##Size - line_index_start):line_seg_length_avg; \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR);\ + TENSOR##_data += TENSOR##_memory_offset;\ + ptrdiff_t count = 0;\ + ptrdiff_t TENSOR##_start = TENSOR##_counter_tmp[TENSOR##_dim - 1];\ + while(count < line_seg_length){\ + for(TENSOR##_i=TENSOR##_start; (count < line_seg_length)&&(TENSOR##_i < TENSOR##_size); ++TENSOR##_i, ++count){\ + CODE\ + TENSOR##_data += TENSOR##_stride;\ + }\ + if(count < line_seg_length){\ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR);\ + }\ + }\ + if(TENSOR##_counter_tmp != NULL) \ + THFree(TENSOR##_counter_tmp); \ + }\ + }\ + if(TENSOR##_counter != NULL)\ + THFree(TENSOR##_counter);\ + }\ +} + +#define TH_TENSOR_APPLY2_OMP(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, OMP_THRESHOLD) \ +{ \ + /* for advanced searching index*/ \ + if( CONTIG1 && CONTIG2 ){ \ + TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ + TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + ptrdiff_t iter = 0; \ + if(tp != (TYPE2*)rp) { \ + PRAGMA(ivdep) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \ + for (iter = 0; iter < SIZE; iter++) { \ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE1 *TENSOR1##_data = rp+iter; \ + CODE \ + }\ + } else {\ + PRAGMA(simd) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE2* TENSOR2##_data = tp+iter;\ + TYPE1* TENSOR1##_data = rp+iter;\ + CODE \ + }\ + }\ + } else { \ + /* The following strategy is not easy to understand. + * 1. Collapse the dimension of the tensors in order to decrease the number of nested loops. + * 2. Calculate the numbers of elements allocated in each thread and the line index of the first one. + * 3. Calculate the memory offset of the first element and the indexes in each dimension of the + * first one. + * 4. iterate all elements in each thread. update the indexes in each dimension of the rest. + */ \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ + /*step 1*/ \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i)) \ + { \ + /*step 2*/ \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + size_t line_seg_length_avg = SIZE/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(SIZE - line_index_start):line_seg_length_avg; \ + /* step 3*/ \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \ + TENSOR2##_data += TENSOR2##_memory_offset; \ + TENSOR1##_data += TENSOR1##_memory_offset; \ + ptrdiff_t count = 0; \ + ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim-1]; \ + ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim-1]; \ + /* step 4*/ \ + while (count < line_seg_length) { \ + for(TENSOR2##_i=TENSOR2##_start, TENSOR1##_i = TENSOR1##_start; ((count < line_seg_length) && (TENSOR2##_i < TENSOR2##_size) && (TENSOR1##_i < TENSOR1##_size)); ++TENSOR2##_i, ++TENSOR1##_i, ++count){ \ + CODE \ + TENSOR2##_data += TENSOR2##_stride; \ + TENSOR1##_data += TENSOR1##_stride; \ + } \ + if (count < line_seg_length){ \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR2); \ + __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR1); \ + } \ + } \ + if(TENSOR1##_counter_tmp != NULL) \ + THFree(TENSOR1##_counter_tmp); \ + if(TENSOR2##_counter_tmp != NULL) \ + THFree(TENSOR2##_counter_tmp); \ + } \ + } \ + if(TENSOR2##_counter != NULL) \ + THFree(TENSOR2##_counter); \ + if(TENSOR1##_counter != NULL) \ + THFree(TENSOR1##_counter);\ + }\ +} + +#define TH_TENSOR_APPLY3_OMP(SIZE, CONTIG1, CONTIG2, CONTIG3, TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE, OMP_THRESHOLD) \ +{ \ + /* for adveanced searching index*/ \ + if(CONTIG1 && CONTIG2 && CONTIG3){ \ + TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ + TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + TYPE3 *srcp = TENSOR3->storage->data()+TENSOR3->storageOffset; \ + ptrdiff_t iter = 0;\ + if(tp != (TYPE2*)rp) { \ + PRAGMA(ivdep) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE1 *TENSOR1##_data = rp+iter;\ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE3 *TENSOR3##_data = srcp+iter;\ + CODE \ + } \ + } else {\ + PRAGMA(simd) \ + PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ + for (iter = 0; iter < SIZE; iter++) {\ + TYPE1 *TENSOR1##_data = rp+iter;\ + TYPE2 *TENSOR2##_data = tp+iter; \ + TYPE3 *TENSOR3##_data = srcp+iter;\ + CODE \ + } \ + }\ + } else{ \ + int TH_TENSOR_APPLY_hasFinished = 0;\ + int64_t TH_TENSOR_dim_index = 0;\ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ + __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, -1, 1) \ + if (0 == TH_TENSOR_APPLY_hasFinished) { \ + PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i, TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR3##_data, TENSOR3##_sizes, TENSOR3##_strides, TENSOR3##_dim, TENSOR3##_stride, TENSOR3##_size, TENSOR3##_i))\ + {\ + size_t num_threads = omp_get_num_threads();\ + size_t tid = omp_get_thread_num();\ + size_t line_seg_length_avg = SIZE/num_threads; \ + ptrdiff_t line_index_start = tid * line_seg_length_avg; \ + ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(SIZE - line_index_start):line_seg_length_avg; \ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1);\ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2);\ + __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR3);\ + TENSOR1##_data += TENSOR1##_memory_offset;\ + TENSOR2##_data += TENSOR2##_memory_offset;\ + TENSOR3##_data += TENSOR3##_memory_offset;\ + ptrdiff_t count = 0;\ + ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim - 1];\ + ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim - 1];\ + ptrdiff_t TENSOR3##_start = TENSOR3##_counter_tmp[TENSOR3##_dim - 1];\ + while(count < line_seg_length){\ + for(TENSOR1##_i=TENSOR1##_start, TENSOR2##_i=TENSOR2##_start,TENSOR3##_i=TENSOR3##_start; (countdim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + continue; \ + if (TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (shape_check_flag == 1) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("Expected %s %s, %s %s and %s %s to have the same size apart from dimension %d", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ + } \ +} + +#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, SIZE_CHECK, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = NULL; \ + TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + TYPE2 *TENSOR2##_data = NULL; \ + TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + TYPE3 *TENSOR3##_data = NULL; \ + TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ + int same_dims = 1; \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + same_dims = 0; \ + } \ + if( TENSOR1->dim() != TENSOR3->dim() ) { \ + same_dims = 0; \ + } \ + if (same_dims == 0) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ + THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ + "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \ + } \ + SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ +\ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ + TENSOR1##_size = TENSOR1->size[DIMENSION]; \ +\ + TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ + TENSOR2##_size = TENSOR2->size[DIMENSION]; \ +\ + TENSOR3##_data = 
(TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ + TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \ + TENSOR3##_size = TENSOR3->size[DIMENSION]; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR1->dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +/** + * Similar to DIM_APPLY(...) but we maintain two sets of pointers: one for the first tensor + * and one for the second. The two tensors must have the same shape, other than at the + * specified DIMENSION. This function makes it easy to store the output from reducing the + * TENSOR at index. For example, in the sum example described below, we could instead do: + * + * int64_t i = 0; + * TYPE1 sum; + * + * for (i = 0; i < TENSOR1##_size; ++i) { + * sum += TENSOR1##_data[i * TENSOR1##_stride] + * } + * *TENSOR2##_data = (TYPE2) sum; + * + * In particular, we guarantee that the offset into TENSOR2 will be what you would get if + * you applied all of the index values used to generate the offset into TENSOR1. 
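+ *
+ * (A typical pattern: TENSOR1 is the input being reduced along DIMENSION and
+ * TENSOR2 is the output, usually with size 1 along DIMENSION so that each
+ * invocation of CODE writes exactly one result element, as in the sum sketch above.)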
+ */ +#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = NULL; \ + TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + TYPE2 *TENSOR2##_data = NULL; \ + TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ + "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + } \ + TH_UNUSED int shape_check_flag = 0; \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + continue; \ + if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ + THError("Expected %s %s and %s %s to have the same size in dimension %d", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION); \ + } \ + } \ +\ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ + TENSOR1##_size = TENSOR1->size[DIMENSION]; \ +\ + TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ + TENSOR2##_size = TENSOR2->size[DIMENSION]; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR1->dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +/** + * The basic idea for DIM_APPLY: Given a TENSOR and a DIMENSION, provide access to the data stored + * at all sets of dimension values other than DIMENSION, such that we 
can get all the values at those + * fixed indices for the various values at DIMENSION. + * + * Suppose we have a 2x3x4 Tensor A, and we have DIMENSION=2. Then we will hit CODE (2x3) times, and the + * pointer into storage will be at: + * + * A[0][0] + * A[0][1] + * A[0][2] + * A[1][0] + * A[1][1] + * A[1][2] + * + * And at each point, we can access the data for each of the four elements of the Tensor via + * TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do: + * + * int64_t i = 0; + * TYPE sum; + * for (i = 0; i < TENSOR##_size; i++) { + * sum += TENSOR##_data[i * TENSOR##_stride] + * } + * + * Note that we don't have to have DIMENSION be the last tensor. If we have DIMENSION=1, then we will hit the + * code (2x4) times, with pointer into the storage at: + * + * offset + + * stride_0 * 0 + stride_2 * 0 + * stride_0 * 1 + stride_2 * 0 + * stride_0 * 0 + stride_2 * 1 + * stride_0 * 1 + stride_2 * 1 + * stride_0 * 0 + stride_2 * 2 + * stride_0 * 1 + stride_2 * 2 + * stride_0 * 0 + stride_2 * 3 + * stride_0 * 1 + stride_2 * 3 + * + * So we can again sum over the values at DIMENSION with the other indices fixed. + */ +#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \ +{ \ + TYPE *TENSOR##_data = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_i; \ +\ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR->_dim()) ) \ + THError("invalid dimension"); \ +\ + TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ + TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ + TENSOR##_size = TENSOR->size[DIMENSION]; \ + /* Counter stores the indices into the Tensor at any time */ \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR->_dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ +\ + while(!TH_TENSOR_DIM_APPLY_hasFinished) \ + { \ + CODE \ +\ + if(TENSOR->_dim() == 1) \ + break; \ + \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + /* Check if the index is equal to DIMENSION. We don't need to update the */ \ + /* offset if this is the case, and can consider the next index. However, */ \ + /* in the case that the DIMENSION is the last index in the Tensor, then */ \ + /* we have parsed the entire tensor and can exit */ \ + if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ + { \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + continue; \ + } \ +\ + /* Bump the counter at this index, update the pointer */ \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ + TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ +\ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \ + { \ + /* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. 
If this is the last dimension, exit */ \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ + { \ + TH_TENSOR_DIM_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + /* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \ + TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + } \ + THFree(TH_TENSOR_DIM_APPLY_counter); \ +} + +#endif diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp new file mode 100644 index 0000000..30dd076 --- /dev/null +++ b/aten/src/TH/THTypeConversion.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include +#include "THHalf.h" + +// Type traits to convert types to TH-specific types. Used primarily to +// convert at::Half to TH's half type. This makes the conversion explicit. +// FIXME: we should just use the same type + +namespace th { + +template +struct FromTypeConversion { + using type = T; +}; + +template <> +struct FromTypeConversion { + using type = at::Half; +}; + +template +using from_type = typename FromTypeConversion::type; +} diff --git a/aten/src/TH/THVector.cpp b/aten/src/TH/THVector.cpp new file mode 100644 index 0000000..3460d17 --- /dev/null +++ b/aten/src/TH/THVector.cpp @@ -0,0 +1,30 @@ +#include "THVector.h" + +#include "generic/simd/simd.h" + +#ifdef __NEON__ +#include "vector/NEON.cpp" +#endif + +#ifdef __PPC64__ +#include "vector/VSX.cpp" +#endif + +#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ + || defined(USE_SSE4_1) || defined(USE_SSE4_2) +#include "vector/SSE.cpp" +#endif + +#if defined(USE_AVX) +#include "vector/AVX.h" +#endif + +#if defined(USE_AVX2) +#include "vector/AVX2.h" +#endif + +#include "generic/THVectorDefault.cpp" +#include "THGenerateAllTypes.h" + +#include "generic/THVectorDispatch.cpp" +#include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THVector.h b/aten/src/TH/THVector.h new file mode 100644 index 0000000..8054f64 --- /dev/null +++ b/aten/src/TH/THVector.h @@ -0,0 +1,14 @@ +#ifndef TH_VECTOR_INC +#define TH_VECTOR_INC + +#include "THGeneral.h" +#include "THMath.h" + +#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) + +/* We are going to use dynamic dispatch, and want only to generate declarations + * of the vector functions */ +#include "generic/THVector.h" +#include "THGenerateAllTypes.h" + +#endif // TH_VECTOR_INC diff --git a/aten/src/TH/generic/THBlas.cpp b/aten/src/TH/generic/THBlas.cpp new file mode 100644 index 0000000..d06ae6a --- /dev/null +++ b/aten/src/TH/generic/THBlas.cpp @@ -0,0 +1,435 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THBlas.cpp" +#else + + +#ifdef BLAS_F2C +# define ffloat double +#else +# define ffloat float +#endif + +TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy); +TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx); +TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx); +TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy); +TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy); +TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy); +TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy); +#ifdef BLAS_USE_CBLAS_DOT 
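+/* Some BLAS builds are driven through the CBLAS interface for sdot; the inline
+   wrapper below adapts the Fortran-style sdot_ signature (pointer arguments) to
+   cblas_sdot by dereferencing and forwarding the arguments. */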
+TH_EXTERNC float cblas_sdot(const int n, const float *x, const int incx, const float *y, const int incy); +#ifndef THBlas_C_sdot_ +#define THBlas_C_sdot_ +static inline ffloat sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy) +{ + return cblas_sdot(*n, x, *incx, y, *incy); +} +#endif +#else +TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy); +#endif +TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); +TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy); +TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda); +TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); +TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); +TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); + + + +void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dswap_(&i_n, x, &i_incx, y, &i_incy); +#else + sswap_(&i_n, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) + { + real z = x[i*incx]; + x[i*incx] = y[i*incy]; + y[i*incy] = z; + } + } +} + +void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx) +{ + if(n == 1) + incx = 1; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + +#if defined(TH_REAL_IS_DOUBLE) + dscal_(&i_n, &a, x, &i_incx); +#else + sscal_(&i_n, &a, x, &i_incx); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) { + if (a == 0) { + x[i*incx] = 0; + } else { + x[i*incx] *= a; + } + } + } +} + +void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dcopy_(&i_n, x, &i_incx, y, &i_incy); +#else + scopy_(&i_n, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + for(i = 0; i < n; i++) + y[i*incy] = x[i*incx]; + } +} + +void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + daxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +#else + saxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i; + 
for(i = 0; i < n; i++) + y[i*incy] += a*x[i*incx]; + } +} + +real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) +{ + if(n == 1) + { + incx = 1; + incy = 1; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + return (real) ddot_(&i_n, x, &i_incx, y, &i_incy); +#else + return (real) sdot_(&i_n, x, &i_incx, y, &i_incy); +#endif + } +#endif + { + int64_t i; + real sum = 0; + for(i = 0; i < n; i++) + sum += x[i*incx]*y[i*incy]; + return sum; + } +} + +void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy) +{ + if(n == 1) + lda = m; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, m), 6, + "lda should be at least max(1, m=%d), but have %d", m, lda); + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); +#else + sgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); +#endif + return; + } +#endif + { + int64_t i, j; + + if( (trans == 'T') || (trans == 't') ) + { + for(i = 0; i < n; i++) + { + real sum = 0; + real *row_ = a+lda*i; + for(j = 0; j < m; j++) + sum += x[j*incx]*row_[j]; + if (beta == 0) + y[i*incy] = alpha*sum; + else + y[i*incy] = beta*y[i*incy] + alpha*sum; + } + } + else + { + if(beta != 1) + THBlas_(scal)(m, beta, y, incy); + + for(j = 0; j < n; j++) + { + real *column_ = a+lda*j; + real z = alpha*x[j*incx]; + for(i = 0; i < m; i++) + y[i*incy] += z*column_[i]; + } + } + } +} + +void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda) +{ + if(n == 1) + lda = m; + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, m), 9, + "lda should be at least max(1, m=%d), but have %d", m, lda); + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + +#if defined(TH_REAL_IS_DOUBLE) + dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); +#else + sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); +#endif + return; + } +#endif + { + int64_t i, j; + for(j = 0; j < n; j++) + { + real *column_ = a+j*lda; + real z = alpha*y[j*incy]; + for(i = 0; i < m; i++) + column_[i] += z*x[i*incx] ; + } + } +} + +void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc) +{ + int transa_ = ((transa == 't') || (transa == 'T')); + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + ldc = m; + + if(transa_) + { + if(m == 1) + lda = k; + } + else + { + if(k == 1) + lda = m; + } + + if(transb_) + { + if(k == 1) + ldb = n; + } + else + { + if(n == 1) + ldb = k; + } + +#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || 
defined(TH_REAL_IS_FLOAT)) + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && + (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + THArgCheck(lda >= THMax(1, (transa_ ? k : m)), 8, + "lda should be at least max(1, %d), but have %d", (transa_ ? k : m), lda); + THArgCheck(ldb >= THMax(1, (transb_ ? n : k)), 10, + "ldb should be at least max(1, %d), but have %d", (transb_ ? n : k), ldb); + THArgCheck(ldc >= THMax(1, m), 13, + "ldc should be at least max(1, m=%d), but have %d", m, ldc); + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + +#if defined(TH_REAL_IS_DOUBLE) + dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); +#else + sgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); +#endif + return; + } +#endif + { + int64_t i, j, l; + if(!transa_ && !transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l*lda]*b_[l]; + b_ += ldb; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_++; + } + } + else if(transa_ && !transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l]*b_[l]; + b_ += ldb; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_ += lda; + } + } + else if(!transa_ && transb_) + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l*lda]*b_[l*ldb]; + b_++; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_++; + } + } + else + { + real *a_ = a; + for(i = 0; i < m; i++) + { + real *b_ = b; + for(j = 0; j < n; j++) + { + real sum = 0; + for(l = 0; l < k; l++) + sum += a_[l]*b_[l*ldb]; + b_++; + if (beta == 0) + c[j*ldc+i] = alpha*sum; + else + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; + } + a_ += lda; + } + } + } +} + +#endif diff --git a/aten/src/TH/generic/THBlas.h b/aten/src/TH/generic/THBlas.h new file mode 100644 index 0000000..c36e796 --- /dev/null +++ b/aten/src/TH/generic/THBlas.h @@ -0,0 +1,19 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THBlas.h" +#else + +/* Level 1 */ +TH_API void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx); +TH_API void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy); +TH_API real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); + +/* Level 2 */ +TH_API void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy); +TH_API void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda); + +/* Level 3 */ +TH_API void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc); + +#endif diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp new file mode 100644 index 0000000..8f3ccc8 --- /dev/null +++ b/aten/src/TH/generic/THLapack.cpp 
@@ -0,0 +1,270 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THLapack.cpp" +#else + + +TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); +TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); +TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); +TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info); +TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info); +TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); +TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); +TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); +TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); +TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info); +TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info); +TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); +TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); +TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); +TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); +TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); +TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info); +TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info); +TH_EXTERNC void spotrf_(char *uplo, int *n, float *a, int *lda, int *info); +TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); +TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); +TH_EXTERNC void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); +TH_EXTERNC void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +TH_EXTERNC void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +TH_EXTERNC void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +TH_EXTERNC void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +TH_EXTERNC void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, 
float *c, int *ldc, float *work, int *lwork, int *info); +TH_EXTERNC void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); +TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *rank, float *tol, float *work, int *info); +TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info); + + +/* Compute the solution to a real system of linear equations A * X = B */ +void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#else + sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#endif +#else + THError("gesv : Lapack library not found in compile time\n"); +#endif + return; +} + +/* Solve a triangular system of the form A * X = B or A^T * X = B */ +void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dtrtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); +#else + strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); +#endif +#else + THError("trtrs : Lapack library not found in compile time\n"); +#endif + return; +} + +/* Solve overdetermined or underdetermined real linear systems involving an +M-by-N matrix A, or its transpose, using a QR or LQ factorization of A */ +void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); +#else + sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); +#endif +#else + THError("gels : Lapack library not found in compile time\n"); +#endif +} + +/* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric +matrix A */ +void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); +#else + ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); +#endif +#else + THError("syev : Lapack library not found in compile time\n"); +#endif +} + +/* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and, +optionally, the left and/or right eigenvectors */ +void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +#else + sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +#endif +#else + THError("geev : Lapack library not found in compile time\n"); +#endif +} + +/* Compute the singular value decomposition (SVD) of a real M-by-N matrix A, +optionally computing the left and/or right singular vectors */ +void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); +#else + sgesvd_( &jobu, &jobvt, 
&m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); +#endif +#else + THError("gesvd : Lapack library not found in compile time\n"); +#endif +} + +/* LU decomposition */ +void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetrf_(&m, &n, a, &lda, ipiv, info); +#else + sgetrf_(&m, &n, a, &lda, ipiv, info); +#endif +#else + THError("getrf : Lapack library not found in compile time\n"); +#endif +} + +void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#else + sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +#endif +#else + THError("getrs : Lapack library not found in compile time\n"); +#endif +} + +/* Matrix Inverse */ +void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgetri_(&n, a, &lda, ipiv, work, &lwork, info); +#else + sgetri_(&n, a, &lda, ipiv, work, &lwork, info); +#endif +#else + THError("getri : Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization */ +void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotrf_(&uplo, &n, a, &lda, info); +#else + spotrf_(&uplo, &n, a, &lda, info); +#endif +#else + THError("potrf : Lapack library not found in compile time\n"); +#endif +} + +/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ +void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +#else + spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +#endif +#else + THError("potrs: Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization based Matrix Inverse */ +void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpotri_(&uplo, &n, a, &lda, info); +#else + spotri_(&uplo, &n, a, &lda, info); +#endif +#else + THError("potri: Lapack library not found in compile time\n"); +#endif +} + +/* Cholesky factorization with complete pivoting */ +void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dpstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); +#else + spstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); +#endif +#else + THError("pstrf: Lapack library not found at compile time\n"); +#endif +} + +/* QR decomposition */ +void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info); +#else + sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info); +#endif +#else + THError("geqrf: Lapack library not found in compile time\n"); +#endif +} + +/* Build Q from output of geqrf */ +void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); +#else + sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); +#endif +#else + THError("orgqr: Lapack 
library not found in compile time\n"); +#endif +} + +/* Multiply Q with a matrix using the output of geqrf */ +void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info) +{ +#ifdef USE_LAPACK +#if defined(TH_REAL_IS_DOUBLE) + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); +#else + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); +#endif +#else + THError("ormqr: Lapack library not found in compile time\n"); +#endif +} + + +#endif diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h new file mode 100644 index 0000000..fe64dae --- /dev/null +++ b/aten/src/TH/generic/THLapack.h @@ -0,0 +1,40 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THLapack.h" +#else + +/* AX=B */ +TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info); +/* Solve a triangular system of the form A * X = B or A^T * X = B */ +TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info); +/* ||AX-B|| */ +TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info); +/* Eigenvals */ +TH_API void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info); +/* Non-sym eigenvals */ +TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info); +/* svd */ +TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info); +/* LU decomposition */ +TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info); +TH_API void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info); +/* Matrix Inverse */ +TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info); + +/* Positive Definite matrices */ +/* Cholesky factorization */ +TH_API void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info); +/* Matrix inverse based on Cholesky factorization */ +TH_API void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info); +/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ +TH_API void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info); +/* Cholesky factorization with complete pivoting. 
*/ +TH_API void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info); + +/* QR decomposition */ +TH_API void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info); +/* Build Q from output of geqrf */ +TH_API void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info); +/* Multiply Q with a matrix from output of geqrf */ +TH_API void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info); + +#endif diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp new file mode 100644 index 0000000..2d499b0 --- /dev/null +++ b/aten/src/TH/generic/THStorage.cpp @@ -0,0 +1,136 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorage.cpp" +#else + +#include + +real* THStorage_(data)(const THStorage *self) +{ + return self->data(); +} + +ptrdiff_t THStorage_(size)(const THStorage *self) +{ + return THStorage_size(self); +} + +size_t THStorage_(elementSize)() +{ + return sizeof(real); +} + +THStorage* THStorage_(new)(void) +{ + return THStorage_new(at::CTypeToScalarType>::to()); +} + +THStorage* THStorage_(newWithSize)(ptrdiff_t size) +{ + return THStorage_newWithSize(at::CTypeToScalarType>::to(), size); +} + +THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, + at::Allocator *allocator) +{ + return THStorage_newWithAllocator(at::CTypeToScalarType>::to(), size, allocator); +} + + +THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) +{ + return THStorage_newWithMapping(at::CTypeToScalarType>::to(), filename, size, flags); +} + +THStorage* THStorage_(newWithSize1)(real data0) +{ + THStorage *self = THStorage_(newWithSize)(1); + real *data = THStorage_(data)(self); + data[0] = data0; + return self; +} + +THStorage* THStorage_(newWithSize2)(real data0, real data1) +{ + THStorage *self = THStorage_(newWithSize)(2); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + return self; +} + +THStorage* THStorage_(newWithSize3)(real data0, real data1, real data2) +{ + THStorage *self = THStorage_(newWithSize)(3); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + data[2] = data2; + return self; +} + +THStorage* THStorage_(newWithSize4)(real data0, real data1, real data2, real data3) +{ + THStorage *self = THStorage_(newWithSize)(4); + real *data = THStorage_(data)(self); + data[0] = data0; + data[1] = data1; + data[2] = data2; + data[3] = data3; + return self; +} + +void THStorage_(setFlag)(THStorage *storage, const char flag) +{ + THStorage_setFlag(storage, flag); +} + +void THStorage_(clearFlag)(THStorage *storage, const char flag) +{ + THStorage_clearFlag(storage, flag); +} + +void THStorage_(retain)(THStorage *storage) +{ + THStorage_retain(storage); +} + +void THStorage_(free)(THStorage *storage) +{ + THStorage_free(storage); +} + +THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator) { + return THStorage_newWithDataAndAllocator(at::CTypeToScalarType>::to(), std::move(data), size, allocator); +} + +void THStorage_(resize)(THStorage *storage, ptrdiff_t size) +{ + return THStorage_resize(storage, size); +} + +void THStorage_(fill)(THStorage *storage, real value) +{ + ptrdiff_t i; + for(i = 0; i < storage->size; i++) + THStorage_(data)(storage)[i] = value; +} + +void THStorage_(set)(THStorage *self, 
ptrdiff_t idx, real value) +{ + THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); + THStorage_(data)(self)[idx] = value; +} + +real THStorage_(get)(const THStorage *self, ptrdiff_t idx) +{ + THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); + return THStorage_(data)(self)[idx]; +} + +void THStorage_(swap)(THStorage *storage1, THStorage *storage2) +{ + THStorage_swap(storage1, storage2); +} + +#endif diff --git a/aten/src/TH/generic/THStorage.h b/aten/src/TH/generic/THStorage.h new file mode 100644 index 0000000..4850c47 --- /dev/null +++ b/aten/src/TH/generic/THStorage.h @@ -0,0 +1,74 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorage.h" +#else + +#ifdef __cplusplus +#include +#endif + +/* on pourrait avoir un liste chainee + qui initialise math, lab structures (or more). + mouais -- complique. + + Pb: THMapStorage is kind of a class + THLab_()... comment je m'en sors? + + en template, faudrait que je les instancie toutes!!! oh boy! + Et comment je sais que c'est pour Cuda? Le type float est le meme dans les <> + + au bout du compte, ca serait sur des pointeurs float/double... etc... = facile. + primitives?? + */ + +#define TH_STORAGE_REFCOUNTED 1 +#define TH_STORAGE_RESIZABLE 2 + +// Struct definition is moved to THStorage.hpp (so this file stays C compatible) +typedef struct THStorage THStorage; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THStorage type. +#define THFloatStorage THStorage +#define THDoubleStorage THStorage +#define THHalfStorage THStorage +#define THByteStorage THStorage +#define THCharStorage THStorage +#define THShortStorage THStorage +#define THIntStorage THStorage +#define THLongStorage THStorage + +TH_API real* THStorage_(data)(const THStorage*); +TH_API ptrdiff_t THStorage_(size)(const THStorage*); +TH_API size_t THStorage_(elementSize)(void); + +/* slow access -- checks everything */ +TH_API void THStorage_(set)(THStorage*, ptrdiff_t, real); +TH_API real THStorage_(get)(const THStorage*, ptrdiff_t); + +TH_API THStorage* THStorage_(new)(void); +TH_API THStorage* THStorage_(newWithSize)(ptrdiff_t size); +TH_API THStorage* THStorage_(newWithSize1)(real); +TH_API THStorage* THStorage_(newWithSize2)(real, real); +TH_API THStorage* THStorage_(newWithSize3)(real, real, real); +TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real); +TH_API THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags); + +TH_API THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, + THAllocator* allocator); +#ifdef __cplusplus +TH_API THStorage* THStorage_(newWithDataAndAllocator)( + at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator); +#endif + +/* should not differ with API */ +TH_API void THStorage_(setFlag)(THStorage *storage, const char flag); +TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag); +TH_API void THStorage_(retain)(THStorage *storage); +TH_API void THStorage_(swap)(THStorage *storage1, THStorage *storage2); + +/* might differ with other API (like CUDA) */ +TH_API void THStorage_(free)(THStorage *storage); +TH_API void THStorage_(resize)(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_(fill)(THStorage *storage, real value); + +#endif diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp new file mode 100644 index 0000000..946be62 --- /dev/null +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -0,0 +1,87 @@ +#ifndef 
TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorageCopy.cpp" +#else + +void THStorage_(rawCopy)(THStorage *storage, real *src) +{ + ptrdiff_t i; + real *data = THStorage_(data)(storage); + for(i = 0; i < storage->size; i++) + data[i] = src[i]; +} + +void THStorage_(copy)(THStorage *storage, THStorage *src) +{ + THArgCheck(storage->size == src->size, 2, "size mismatch"); + THStorage_(rawCopy)(storage, THStorage_(data)(src)); +} + +// NOTE: for performance, these macros generally use the raw data pointer in the inner loops, +// rather than repeated THStorage_(data) calls. + +#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = static_cast(src_data[i]); \ +} + +#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = (real)TH_half2float(src_data[i]); \ +} + +#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = TH_float2half((float)(src_data[i])); \ +} + +#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + auto data = THStorage_(data)(storage); \ + auto src_data = TH##TYPENAMESRC##Storage_data(src); \ + for(i = 0; i < storage->size; i++) \ + data[i] = static_cast(src_data[i]); \ +} + +#ifndef TH_REAL_IS_HALF +IMPLEMENT_THStorage_COPY(Byte) +IMPLEMENT_THStorage_COPY(Char) +IMPLEMENT_THStorage_COPY(Short) +IMPLEMENT_THStorage_COPY(Int) +IMPLEMENT_THStorage_COPY(Long) +IMPLEMENT_THStorage_COPY(Float) +IMPLEMENT_THStorage_COPY(Double) +IMPLEMENT_THStorage_COPY_FROM_HALF(Half) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) +IMPLEMENT_THStorage_COPY_TO_HALF(Byte) +IMPLEMENT_THStorage_COPY_TO_HALF(Char) +IMPLEMENT_THStorage_COPY_TO_HALF(Short) +IMPLEMENT_THStorage_COPY_TO_HALF(Int) +IMPLEMENT_THStorage_COPY_TO_HALF(Long) +IMPLEMENT_THStorage_COPY_TO_HALF(Float) +IMPLEMENT_THStorage_COPY_TO_HALF(Double) +#endif + + +#endif diff --git a/aten/src/TH/generic/THStorageCopy.h b/aten/src/TH/generic/THStorageCopy.h new file mode 100644 index 0000000..ce8a2a6 --- /dev/null +++ b/aten/src/TH/generic/THStorageCopy.h @@ -0,0 +1,18 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THStorageCopy.h" +#else + +/* Support for copy between different Storage types */ + +TH_API void THStorage_(rawCopy)(THStorage *storage, real *src); +TH_API void THStorage_(copy)(THStorage *storage, THStorage *src); +TH_API void THStorage_(copyByte)(THStorage *storage, struct THByteStorage *src); +TH_API void THStorage_(copyChar)(THStorage *storage, struct THCharStorage *src); +TH_API void 
THStorage_(copyShort)(THStorage *storage, struct THShortStorage *src); +TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src); +TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src); +TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src); +TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src); +TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src); + +#endif diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp new file mode 100644 index 0000000..0428c8f --- /dev/null +++ b/aten/src/TH/generic/THTensor.cpp @@ -0,0 +1,890 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensor.cpp" +#else + +#include + +/**** access methods ****/ +THStorage *THTensor_(storage)(const THTensor *self) +{ + return self->storage; +} + +ptrdiff_t THTensor_(storageOffset)(const THTensor *self) +{ + return self->storageOffset; +} + +int THTensor_(nDimension)(const THTensor *self) +{ + return self->dim(); +} + +int THTensor_(_nDimension)(const THTensor *self) +{ + return self->_dim(); +} + +int64_t THTensor_(size)(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); + return self->size[dim]; +} + +int64_t THTensor_(stride)(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); + return self->stride[dim]; +} + +THLongStorage *THTensor_(newSizeOf)(THTensor *self) +{ + THLongStorage *size = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(size, self->size); + return size; +} + +THLongStorage *THTensor_(newStrideOf)(THTensor *self) +{ + THLongStorage *stride = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(stride, self->stride); + return stride; +} + +real *THTensor_(data)(const THTensor *self) +{ + if(self->storage) + return (THStorage_(data)(self->storage)+self->storageOffset); + else + return NULL; +} + +/**** creation methods ****/ + +/* Empty init */ +THTensor *THTensor_(new)(void) +{ + return new THTensor(THStorage_(new)()); +} + +/* Pointer-copy init */ +THTensor *THTensor_(newWithTensor)(THTensor *tensor) +{ + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(setStorageNd)(self, + tensor->storage, + tensor->storageOffset, + tensor->dim(), + tensor->size, + tensor->stride); + return self; +} + +/* Storage init */ +THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) +{ + if(size && stride) { + THArgCheck(size->size == stride->size, 4, "inconsistent size"); + } + AT_CHECK(size, "size must not be null"); + + THTensor *self = new THTensor(THStorage_(new)()); +#ifdef DEBUG + THAssert(size->size <= INT_MAX); +#endif + THTensor_(setStorageNd)(self, + storage, + storageOffset, + size->size, + THLongStorage_data(size), + (stride ? 
THLongStorage_data(stride) : NULL)); + + return self; +} + +THTensor *THTensor_(newWithStorageIntLists)(THStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); + + return self; +} + +THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0}, {stride0}); +} + +THTensor *THTensor_(newWithStorage2d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0, size1}, {stride0, stride1}); +} + +THTensor *THTensor_(newWithStorage3d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, {size0, size1, size2}, {stride0, stride1, stride2}); +} + +THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2, + int64_t size3, int64_t stride3) +{ + return THTensor_(newWithStorageIntLists)(storage, storageOffset, + {size0, size1, size2, size3}, + {stride0, stride1, stride2, stride3}); +} + +THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) +{ + return THTensor_(newWithStorage)(NULL, 0, size, stride); +} + +THTensor *THTensor_(newWithSizeIntList)(at::IntList sizes) { + THTensor *self = new THTensor(THStorage_(new)()); + THTensor_(resizeNd)(self, sizes.size(), const_cast(sizes.data()), nullptr); + + return self; +} + +THTensor *THTensor_(newWithSize1d)(int64_t size0) +{ + return THTensor_(newWithSizeIntList)({size0}); +} + +THTensor *THTensor_(newWithSize2d)(int64_t size0, int64_t size1) +{ + return THTensor_(newWithSizeIntList)({size0, size1}); +} + +THTensor *THTensor_(newWithSize3d)(int64_t size0, int64_t size1, int64_t size2) +{ + return THTensor_(newWithSizeIntList)({size0, size1, size2}); +} + +THTensor *THTensor_(newWithSize4d)(int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + return THTensor_(newWithSizeIntList)({size0, size1, size2, size3}); +} + +THTensor *THTensor_(newClone)(THTensor *self) +{ + THTensor *tensor = THTensor_(new)(); + THTensor_(resizeAs)(tensor, self); + THTensor_(copy)(tensor, self); + return tensor; +} + +THTensor *THTensor_(newContiguous)(THTensor *self) +{ + if(!THTensor_(isContiguous)(self)) + return THTensor_(newClone)(self); + else + { + THTensor_(retain)(self); + return self; + } +} + +THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(select)(self, NULL, dimension_, sliceIndex_); + return self; +} + +THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_); + return self; +} + +THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + 
THTensor_(transpose)(self, NULL, dimension1_, dimension2_); + return self; +} + +THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_) +{ + THTensor *self = THTensor_(newWithTensor)(tensor); + THTensor_(unfold)(self, NULL, dimension_, size_, step_); + return self; +} + +THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) +{ + ptrdiff_t numel = THTensor_(nElement)(tensor); + THTensor *self = THTensor_(new)(); + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), + at::IntList(tensor->stride, tensor->dim()), + at::IntList(inferred_size->data(), inferred_size->size)); + THArgCheck(stride.has_value(), 2, "view size is " + "not compatible with input tensor's size and stride (at least one dimension spans " + "across two contiguous subspaces). Call .contiguous() before .view()."); + auto stride_value = *stride; + THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); + THLongStorage_rawCopy(new_stride, stride_value.data()); + THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THLongStorage_free(inferred_size); + THLongStorage_free(new_stride); + return self; +} + +/* Resize */ +void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride) +{ + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + +#ifdef DEBUG + THAssert(size->size <= INT_MAX); +#endif + THTensor_(resizeNd)(self, size->size, THLongStorage_data(size), (stride ? THLongStorage_data(stride) : NULL)); +} + +void THTensor_(resizeAs)(THTensor *self, THTensor *src) +{ + if(!THTensor_(isSameSizeAs)(self, src)) + THTensor_(resizeNd)(self, src->dim(), src->size, NULL); +} + +void THTensor_(resize1d)(THTensor *tensor, int64_t size0) +{ + int64_t size[1] = {size0}; + THTensor_(resizeNd)(tensor, 1, size, nullptr); +} + +void THTensor_(resize2d)(THTensor *tensor, int64_t size0, int64_t size1) +{ + int64_t size[2] = {size0, size1}; + THTensor_(resizeNd)(tensor, 2, size, nullptr); +} + +void THTensor_(resize3d)(THTensor *tensor, int64_t size0, int64_t size1, int64_t size2) +{ + int64_t size[3] = {size0, size1, size2}; + THTensor_(resizeNd)(tensor, 3, size, nullptr); +} + +void THTensor_(resize4d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + int64_t size[4] = {size0, size1, size2, size3}; + THTensor_(resizeNd)(self, 4, size, nullptr); +} + +void THTensor_(resize5d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4) +{ + int64_t size[5] = {size0, size1, size2, size3, size4}; + THTensor_(resizeNd)(self, 5, size, nullptr); +} + +void THTensor_(set)(THTensor *self, THTensor *src) +{ + if(self != src) + THTensor_(setStorageNd)(self, + src->storage, + src->storageOffset, + src->dim(), + src->size, + src->stride); +} + +void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) +{ + if(size_ && stride_) + THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); + + AT_CHECK(size_, "size must not be null"); +#ifdef DEBUG + THAssert(size_ <= INT_MAX); +#endif + THTensor_(setStorageNd)(self, + storage_, + storageOffset_, + size_->size, + THLongStorage_data(size_), + (stride_ ? 
THLongStorage_data(stride_) : NULL)); +} + +void THTensor_(setStorageIntLists)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + at::IntList sizes, at::IntList strides) +{ + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + + THTensor_(setStorageNd)(self, storage_, storageOffset_, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); +} + +void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_}, {stride0_}); +} + +void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_, size1_}, + {stride0_, stride1_}); +} + +void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_) +{ + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, + {size0_, size1_, size2_}, + {stride0_, stride1_, stride2_}); +} + +void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_) +{ + + int64_t size[4] = {size0_, size1_, size2_, size3_}; + int64_t stride[4] = {stride0_, stride1_, stride2_, stride3_}; + + THTensor_(setStorageIntLists)(self, storage_, storageOffset_, size, stride); +} + + +void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t firstIndex, int64_t size) +{ + if(!src) + src = self; + + THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck( firstIndex >= 0, 3, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM + THArgCheck( size >= 0, 4, "out of range"); +#else + THArgCheck( size > 0, 4, "out of range"); +#endif + THArgCheck(firstIndex <= src->size[dimension] - size, 4, "out of range"); + + THTensor_(set)(self, src); + + if(firstIndex > 0) + self->storageOffset += firstIndex*self->stride[dimension]; + + self->size[dimension] = size; +} + +void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex) +{ + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 3, "out of range"); + + THTensor_(set)(self, src); + THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1); + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; +} + +void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) +{ + int64_t z; + + if(!src) + src = self; + + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + + THTensor_(set)(self, src); + + if(dimension1 == dimension2) + return; + + z = self->stride[dimension1]; + self->stride[dimension1] = self->stride[dimension2]; + 
self->stride[dimension2] = z; + z = self->size[dimension1]; + self->size[dimension1] = self->size[dimension2]; + self->size[dimension2] = z; +} + +void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t size, int64_t step) +{ + int64_t *newSize; + int64_t *newStride; + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(step > 0, 4, "invalid step"); + + THTensor_(set)(self, src); + + newSize = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + newStride = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + + newSize[self->dim()] = size; + newStride[self->dim()] = self->stride[dimension]; + for(d = 0; d < self->dim(); d++) + { + if(d == dimension) + { + newSize[d] = (self->size[d] - size) / step + 1; + newStride[d] = step*self->stride[d]; + } + else + { + newSize[d] = self->size[d]; + newStride[d] = self->stride[d]; + } + } + + THFree(self->size); + THFree(self->stride); + + self->size = newSize; + self->stride = newStride; + self->dim_++; +} + +/* we have to handle the case where the result is a number */ +void THTensor_(squeeze)(THTensor *self, THTensor *src) +{ + int ndim = 0; + int d; + + if(!src) + src = self; + + THTensor_(set)(self, src); + + for(d = 0; d < src->dim(); d++) + { + if(src->size[d] != 1) + { + if(d != ndim) + { + self->size[ndim] = src->size[d]; + self->stride[ndim] = src->stride[d]; + } + ndim++; + } + } + +#ifndef USE_TH_SCALAR + /* right now, we do not handle 0-dimension tensors */ + if(ndim == 0 && src->dim() > 0) + { + self->size[0] = 1; + self->stride[0] = 1; + ndim = 1; + } +#endif + self->dim_ = ndim; +} + +void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "dimension out of range"); + + THTensor_(set)(self, src); + +#ifdef USE_TH_SCALAR + if(src->size[dimension] == 1) +#else + if(src->size[dimension] == 1 && src->dim() > 1) +#endif + { + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; + } +} + +void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); +#endif + + THTensor_(set)(self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); + self->dim_++; + for (d = self->dim()-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->dim()) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +int THTensor_(isTransposed)(const THTensor *self) +{ + if (THTensor_(isContiguous)(self)) { + return 0; + } + int64_t max_stride = 1; + int64_t size_max_stride = 1; + int64_t z = 1; + int d; + for (d = 0; d < self->_dim(); ++d) { + if (self->stride[d] == 0 && self->size[d] != 1) + return 0; + if (self->stride[d] > max_stride) { + max_stride = 
self->stride[d]; + size_max_stride = self->size[d]; + } + z *= self->size[d]; + } + if (z == max_stride * size_max_stride) { + return 1; + } + return 0; +} + +int THTensor_(isContiguous)(const THTensor *self) +{ + if (self->is_empty()) return 1; + int64_t z = 1; + int d; + for(d = self->dim()-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return 0; + } + } + return 1; +} + +int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims) +{ + int d; + if (self->_dim() != dims->size) + return 0; + + for(d = 0; d < self->_dim(); ++d) + { + if(self->size[d] != THLongStorage_data(dims)[d]) + return 0; + } + return 1; +} + +int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) +{ + int d; + if (self->dim() != src->dim()) + return 0; + for(d = 0; d < self->dim(); ++d) + { + if(self->size[d] != src->size[d]) + return 0; + } + return 1; +} + +int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) +{ + if (!self->storage) + return 0; + if (self->storage == src->storage && + self->storageOffset == src->storageOffset && + self->_dim() == src->_dim()) + { + int d; + for (d = 0; d < self->_dim(); ++d) + { + if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + return 0; + } + return 1; + } + return 0; +} + +ptrdiff_t THTensor_(nElement)(const THTensor *self) +{ + if(self->_dim() == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->_dim(); d++) + nElement *= self->size[d]; + return nElement; + } +} + +void THTensor_(retain)(THTensor *self) +{ + ++self->refcount; +} + +void THTensor_(free)(THTensor *self) +{ + THTensor_free(self); +} + +void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) +{ + if(self != dst) + THTensor_(copy)(dst, self); + + THTensor_(free)(self); +} + +/*******************************************************************************/ + +void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if(self->storage) + THStorage_(free)(self->storage); + + if(storage) + { + self->storage = storage; + THStorage_(retain)(self->storage); + } + else + self->storage = THStorage_(new)(); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THTensor_(resizeNd)(self, nDimension, size, stride); +} + +void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + ptrdiff_t totalSize; + bool hascorrectsize = true; + +#ifndef USE_TH_SCALAR + AT_CHECK(nDimension > 0, "resizeNd nDimension must be greater than 0"); +#else + AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); +#endif + + for(d = 0; d < nDimension; d++) + { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. 
+ if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif + if((self->dim() > d) && (size[d] != self->size[d])) { + hascorrectsize = false; + } + + // NB: this used to test that stride[d] was >= 0 + if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + hascorrectsize = false; + } + } + + if(nDimension != self->dim()) { + hascorrectsize = false; + } + + if(hascorrectsize) { + return; + } + + if(nDimension != self->dim()) + { + self->size = (int64_t *)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t *)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->dim_ = nDimension; + } + + totalSize = 1; + for(d = nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) { + self->stride[d] = stride[d]; + } else { + if(d == nDimension-1) { + self->stride[d] = 1; + } else { + // Keep stride monotonically increasing to match NumPy. + self->stride[d] = std::max(self->size[d+1], 1)*self->stride[d+1]; + } + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) { + self->storage = THStorage_(new)(); + } + if(totalSize+self->storageOffset > self->storage->size) { + THStorage_(resize)(self->storage, totalSize+self->storageOffset); + } + } +} + +void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) +{ + THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); +} + +real THTensor_(get1d)(const THTensor *tensor, int64_t x0) +{ + THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); +} + +void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) +{ + THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); +} + +real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) +{ + THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); +} + +void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) +{ + THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); +} + +real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) +{ + THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); +} + +void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) +{ + THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); +} + +real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) +{ + THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); +} + +THDescBuff THTensor_(desc)(const THTensor *tensor) { + const int L = TH_DESC_BUFF_LEN; + THDescBuff buf; + char *str = buf.str; + int n = 0; +#define _stringify(x) #x + n += snprintf(str, L-n, "torch." _stringify(x) "Tensor of size "); +#undef _stringify + int i; + for(i = 0; i < tensor->_dim(); i++) { + if(n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + if(i < tensor->_dim()-1) { + n += snprintf(str+n, L-n, "x"); + } + } + if(n >= L) { + snprintf(str+L-4, 4, "..."); + } + return buf; +} + +THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) { + THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor); + THDescBuff buf = THLongStorage_sizeDesc(size); + THLongStorage_free(size); + return buf; +} + +#endif diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h new file mode 100644 index 0000000..cdc8f7e --- /dev/null +++ b/aten/src/TH/generic/THTensor.h @@ -0,0 +1,137 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensor.h" +#else + +/* a la lua? dim, storageoffset, ... et les methodes ? */ + +#define THCTensor THTensor + +// Struct definition moved to THTensor.hpp +typedef struct THTensor THTensor; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THTensor type. +#define THFloatTensor THTensor +#define THDoubleTensor THTensor +#define THHalfTensor THTensor +#define THByteTensor THTensor +#define THCharTensor THTensor +#define THShortTensor THTensor +#define THIntTensor THTensor +#define THLongTensor THTensor + +/**** access methods ****/ +TH_API THStorage* THTensor_(storage)(const THTensor *self); +TH_API ptrdiff_t THTensor_(storageOffset)(const THTensor *self); + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). 
+TH_API int THTensor_(nDimension)(const THTensor *self); +TH_API int THTensor_(_nDimension)(const THTensor *self); +TH_API int64_t THTensor_(size)(const THTensor *self, int dim); +TH_API int64_t THTensor_(stride)(const THTensor *self, int dim); +TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); +TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); +TH_API real *THTensor_(data)(const THTensor *self); + + +/**** creation methods ****/ +TH_API THTensor *THTensor_(new)(void); +TH_API THTensor *THTensor_(newWithTensor)(THTensor *tensor); +/* stride might be NULL */ +TH_API THTensor *THTensor_(newWithStorage)(THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +TH_API THTensor *THTensor_(newWithStorage1d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +TH_API THTensor *THTensor_(newWithStorage2d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +TH_API THTensor *THTensor_(newWithStorage3d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +TH_API THTensor *THTensor_(newWithStorage4d)(THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +/* stride might be NULL */ +TH_API THTensor *THTensor_(newWithSize)(THLongStorage *size_, THLongStorage *stride_); +TH_API THTensor *THTensor_(newWithSize1d)(int64_t size0_); +TH_API THTensor *THTensor_(newWithSize2d)(int64_t size0_, int64_t size1_); +TH_API THTensor *THTensor_(newWithSize3d)(int64_t size0_, int64_t size1_, int64_t size2_); +TH_API THTensor *THTensor_(newWithSize4d)(int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); + +TH_API THTensor *THTensor_(newClone)(THTensor *self); +TH_API THTensor *THTensor_(newContiguous)(THTensor *tensor); +TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_); +TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_); +TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_); +TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_); +TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size); + +// resize* methods simply resize the storage. So they may not retain the current data at current indices. +// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the +// values, unless you are doing some size and stride tricks, do not use resize*. 
+TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride); +TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src); +TH_API void THTensor_(resize1d)(THTensor *tensor, int64_t size0_); +TH_API void THTensor_(resize2d)(THTensor *tensor, int64_t size0_, int64_t size1_); +TH_API void THTensor_(resize3d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_); +TH_API void THTensor_(resize4d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); +TH_API void THTensor_(resize5d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_); +// Note: these are legacy resize functions that treat sizes as size->size == 0 and size->data() as being 0-terminated. + +TH_API void THTensor_(set)(THTensor *self, THTensor *src); +TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, int64_t *size, int64_t *stride); +TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +TH_API void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +TH_API void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, int64_t firstIndex_, int64_t size_); +TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, int64_t sliceIndex_); +TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_); +TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, int64_t size_, int64_t step_); + +TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src); +TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_); +TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_); + +TH_API int THTensor_(isContiguous)(const THTensor *self); +TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src); +TH_API int THTensor_(isSetTo)(const THTensor *self, const THTensor *src); +TH_API int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims); +TH_API ptrdiff_t THTensor_(nElement)(const THTensor *self); + +TH_API void THTensor_(retain)(THTensor *self); +TH_API void THTensor_(free)(THTensor *self); +TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst); + +/* Slow access methods [check everything] */ +TH_API void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value); +TH_API void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value); +TH_API void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value); +TH_API void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, 
int64_t x2, int64_t x3, real value); + +TH_API real THTensor_(get1d)(const THTensor *tensor, int64_t x0); +TH_API real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1); +TH_API real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2); +TH_API real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3); + +/* Debug methods */ +TH_API THDescBuff THTensor_(desc)(const THTensor *tensor); +TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor); + +#endif diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp new file mode 100644 index 0000000..fb4670c --- /dev/null +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -0,0 +1,1953 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorConv.cpp" +#else + +/* + 2D Input, 2D kernel : convolve given image with the given kernel. +*/ +void THTensor_(validXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (oc < 4)) { + /* regular convolution */ + for(yy = 0; yy < or_; yy++) { + for(xx = 0; xx < oc; xx++) { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + yy*sr*ic + xx*sc; + real *pw_ = k_; + real sum = 0; + for(ky = 0; ky < kr; ky++) { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[kx]; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < or_; yy++) { + real *pi_ = t_ + yy*sr*ic; + real *pw_ = k_; + for (ky = 0; ky < kr; ky++) { + real *pis_ = pi_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc); + pis_++; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + r_ += oc; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel. +*/ +void THTensor_(validConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (oc < 4)) { + /* regular convolution */ + for(yy = 0; yy < or_; yy++) { + for(xx = 0; xx < oc; xx++) { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + yy*sr*ic + xx*sc; + real *pw_ = k_ + kr*kc - 1; + real sum = 0; + for(ky = 0; ky < kr; ky++) { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[-kx]; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < or_; yy++) { + real *pw_ = k_ + kr*kc - 1; + real *pi_ = t_ + yy*sr*ic; + for (ky = 0; ky < kr; ky++) { + real *pis_ = pi_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc); + pis_++; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + r_ += oc; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. 
+*/ +void THTensor_(fullConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t oc = (ic - 1) * sc + kc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (ic < 4)) { + /* regular convolution */ + for(yy = 0; yy < ir; yy++) { + for(xx = 0; xx < ic; xx++) { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + yy*sr*oc + xx*sc; + real *pw_ = k_; + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[kx]; + } + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + t_++; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < ir; yy++) { + real *po_ = r_ + yy*sr*oc; + real *pw_ = k_; + for (ky = 0; ky < kr; ky++) { + real *pos_ = po_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic); + pos_++; + } + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + t_ += ic; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. +*/ +void THTensor_(fullXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t oc = (ic - 1) * sc + kc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (ic < 4)) { + /* regular convolution */ + for(yy = 0; yy < ir; yy++) { + for(xx = 0; xx < ic; xx++) { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + yy*sr*oc + xx*sc; + real *pw_ = k_ + kr*kc -1; + int64_t kx, ky; + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[-kx]; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + t_++; + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < ir; yy++) { + real *po_ = r_ + yy*sr*oc; + real *pw_ = k_ + kr*kc -1; + for (ky = 0; ky < kr; ky++) { + real *pos_ = po_; + for (kx = 0; kx < kc; kx++) { + THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic); + pos_++; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + t_ += ic; + } + } +} + +/* + 2D Input, 2D kernel : convolve given image with the given kernel, valid convolution. + for sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(validXCorr2DRevptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) +{ + int64_t or_ = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; + + int64_t xx, yy, kx, ky; + + if ((sc != 1) || (kc < 4)) { + /* regular convolution */ + for(yy = 0; yy < kr; yy++) { + for(xx = 0; xx < kc; xx++) { + real *po_ = r_; + real *pi_ = t_ + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + + for(ky = 0; ky < or_; ky++) { + for(kx = 0; kx < oc; kx++) + po_[kx] += z * pi_[kx]; + pi_ += ic; + po_ += oc; + } + } + } + + } else { + /* SSE-based convolution */ + for(yy = 0; yy < kr; yy++) { + for(xx = 0; xx < kc; xx++) { + real *po_ = r_; + real *pi_ = t_ + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + + for(ky = 0; ky < or_; ky++) { + THVector_(cadd)(po_, po_, pi_, z, oc); + pi_ += ic; + po_ += oc; + } + } + } + } +} +/* + 3D Input, 3D kernel : convolve given volume with the given kernel. 
+*/ +void THTensor_(validXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = (it - kt) / st + 1; + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t zz, xx, yy; + + for (zz = 0; zz < ot; zz++) + { + for(yy = 0; yy < or_; yy++) + { + for(xx = 0; xx < oc; xx++) + { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real *pw_ = k_; + real sum = 0; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[kx]; + } + pi_ += ic; /* next input line */ + pw_ += kc; /* next mask line */ + } + pi_ += (ir-kr)*ic; /* next input slice */ + } + /* Update output */ + *r_++ += sum*alpha; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel. +*/ +void THTensor_(validConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = (it - kt) / st + 1; + int64_t or_ = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; + + int64_t zz, xx, yy; + + for(zz = 0; zz < ot; zz++) + { + for(yy = 0; yy < or_; yy++) + { + for(xx = 0; xx < oc; xx++) + { + /* Dot product in two dimensions... (between input image and the mask) */ + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real *pw_ = k_ + kt*kr*kc - 1; + real sum = 0; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + for(kx = 0; kx < kc; kx++) { + sum += pi_[kx]*pw_[-kx]; + } + pi_ += ic; /* next input line */ + pw_ -= kc; /* next mask line */ + } + pi_ += (ir-kr)*ic; /* next input slice */ + } + /* Update output */ + *r_++ += alpha*sum; + } + } + } +} + + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. +*/ +void THTensor_(fullConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; + + int64_t zz, xx, yy; + + for(zz = 0; zz < it; zz++) + { + for(yy = 0; yy < ir; yy++) + { + for(xx = 0; xx < ic; xx++) + { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc; + real *pw_ = k_; + int64_t kz, kx, ky; + /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */ + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + /* printf("o=%g,k=%g," , po_[kx],pw_[kx]); */ + po_[kx] += z * pw_[kx]; + /* printf("o=%g " , po_[kx]); */ + } + /* printf("\n"); */ + po_ += oc; /* next input line */ + pw_ += kc; /* next mask line */ + } + po_ += (or_-kr)*oc; /* next output slice */ + /* printf("\n"); */ + } + t_++; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. 
+*/ +void THTensor_(fullXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t or_ = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; + + int64_t zz, xx, yy; + + for(zz = 0; zz < it; zz++) + { + for(yy = 0; yy < ir; yy++) + { + for(xx = 0; xx < ic; xx++) + { + /* Outer product in two dimensions... (between input image and the mask) */ + real *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc; + real *pw_ = k_ + kt*kr*kc -1; + int64_t kz, kx, ky; + for(kz = 0; kz < kt; kz++) + { + for(ky = 0; ky < kr; ky++) + { + real z = *t_ * alpha; + for(kx = 0; kx < kc; kx++) { + po_[kx] += z * pw_[-kx]; + } + po_ += oc; /* next input line */ + pw_ -= kc; /* next mask line */ + } + po_ += (or_-kr)*oc; /* next output slice */ + } + t_++; + } + } + } +} + +/* + 3D Input, 3D kernel : convolve given image with the given kernel, valid convolution. + for sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(validXCorr3DRevptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) +{ + int64_t ot = it - (kt - 1) * st; + int64_t or_ = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; + + int64_t zz, xx, yy; + for(zz = 0; zz < kt; zz++) + { + for(yy = 0; yy < kr; yy++) + { + for(xx = 0; xx < kc; xx++) + { + real *po_ = r_; + real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; + real z = *k_++ * alpha; + int64_t kz, kx, ky; + for(kz = 0; kz < ot; kz++) + { + for(ky = 0; ky < or_; ky++) + { + for(kx = 0; kx < oc; kx++) + po_[kx] += z * pi_[kx]; + pi_ += ic; + po_ += oc; + } + pi_ += (ir-or_)*ic; /* next input slice */ + } + } + } + } +} + +void THTensor_(conv2d)(real* output_data, + real alpha, + real* ptr_input, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelRows, int64_t nKernelCols, + int64_t srow, int64_t scol, + const char *vf, const char *xc) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); +} + +void THTensor_(conv3d)(real* output_data, + real alpha, + real* ptr_input, int64_t nInputDepth, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelDepth, int64_t nKernelRows, int64_t nKernelCols, + int64_t sdepth, int64_t srow, int64_t scol, + const char *vf, const char *xc) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, 
nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + THTensor_(fullConv3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + else + THTensor_(validConv3Dptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); +} + +int64_t THTensor_(convsize)(int64_t x, int64_t k, int64_t s, const char* vf) +{ + THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'"); + if (*vf == 'V') + return (x-k)/s + 1; + else + return (x-1)*s + k; +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel"); + + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ + +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + 
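/*
  Worked example for the two THTensor_(convsize) formulas (a hypothetical
  self-check, not part of the API):
    'V' (valid): (x - k)/s + 1
    'F' (full) : (x - 1)*s + k
*/
static void sketch_convsize(void)
{
  int64_t v  = THTensor_(convsize)(32, 5, 1, "V");  /* (32-5)/1 + 1 == 28 */
  int64_t f  = THTensor_(convsize)(32, 5, 1, "F");  /* (32-1)*1 + 5 == 36 */
  int64_t v2 = THTensor_(convsize)(32, 5, 2, "V");  /* (32-5)/2 + 1 == 14 (integer division) */
  (void)v; (void)f; (void)v2;
}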
for(i = 0; i < nInputPlane; i++) + { + /* get output */ + real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(validXCorr2DRevptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) +{ + int64_t nbatch, nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0, istride1, kstride1; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + istride1 = input->stride[1]; + nbatch = input->size[0]; + nInputPlane = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelPlane = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel"); + THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size"); + + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ + +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + for(i = 0; i < nInputPlane; i++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + /* get kernel */ + real *ptr_weight = weight_data + p*kstride0 + k*kstride1; + /* get output */ + real 
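/*
  Shape sketch for the "Rev" (gradient-w.r.t.-kernel) routines, with
  hypothetical sizes: input 3 x 16 x 16 (planes x rows x cols), kernel
  8 x 4 x 4, stride srow = scol = 2. Then
      nOutputRows = 16 - (4 - 1)*2 = 10,   nOutputCols = 10,
  and conv2DRevger resizes its result to the 4-D shape
  (nKernelPlane, nInputPlane, 10, 10) = 8 x 3 x 10 x 10, i.e. one
  correlation map per (kernel plane, input plane) pair.
*/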
*ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = input_data + p*istride0 + i*istride1; + + /* do image, kernel convolution */ + THTensor_(validXCorr2DRevptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 3D kernel, 4D output + like rank1 update + A <- xx' + beta*A +*/ +void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]*r_->size[1]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nKernelPlane; k++) + { + int64_t i; + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get output */ + real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; + /* get input */ + real *ptr_input = 
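/*
  The zeroing/scaling blocks above implement the BLAS-style update
      r_ = beta * r_ + alpha * corr(t_, k_):
  the destination is first zeroed (freshly resized result or beta == 0) or
  scaled by beta, and the per-plane kernels then accumulate alpha-weighted
  correlations into it. A scalar sketch of the same convention (hypothetical
  helper, assuming a floating-point `real`):
*/
static real sketch_blas_style_update(real r, real beta, real alpha, real contrib)
{
  r = (beta == 0) ? 0 : r * beta;   /* prepare the destination */
  r += alpha * contrib;             /* accumulate the new term */
  return r;
}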
input_data+i*istride0; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows; */ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 4D kernel, 3D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; + THTensor *input; + THTensor* kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(k) + for (k = 0; k < r_->size[0]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(k) + 
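/*
  The "matrix vector product" analogy in conv2Dmv can be read plane-wise:
  with a 4-D weight W of shape (nOutputPlane, nInputPlane, kH, kW) and a 3-D
  input x of shape (nInputPlane, H, W), output plane k is
      y[k] = beta * y[k] + alpha * sum_i corr2d(x[i], W[k][i]).
  A contiguous-layout sketch of that reduction (hypothetical helper; beta
  handling omitted, the destination is assumed already zeroed or scaled):
*/
static void sketch_conv2Dmv_core(real *y, real alpha,
                                 real *x, int64_t nInputPlane, int64_t H, int64_t W,
                                 real *w, int64_t nOutputPlane, int64_t kH, int64_t kW)
{
  int64_t oH = H - kH + 1, oW = W - kW + 1;   /* valid, stride 1 */
  for (int64_t k = 0; k < nOutputPlane; k++)
    for (int64_t i = 0; i < nInputPlane; i++)
      THTensor_(validXCorr2Dptr)(y + k * oH * oW, alpha,
                                 x + i * H * W, H, W,
                                 w + (k * nInputPlane + i) * kH * kW, kH, kW,
                                 1, 1);
}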
for (k = 0; k < r_->size[0]; k++) + { + real* ptr_output = output_data + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) + { + int64_t i; + /* get output */ + real *ptr_output = output_data + k*nOutputCols*nOutputRows; + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + i*istride0; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + } + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows;*/ + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 3D input, 4D kernel, 3D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t kstride0, kstride1; + THTensor *input; + THTensor* kernel; + int64_t nbatch; + ptrdiff_t nelem; + real *input_data; + real *weight_data; + real *output_data; + int64_t p; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nbatch = input->size[0]; + nInputPlane = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); + + if (*vf == 'F') { + nOutputRows = (nInputRows - 1) * srow + nKernelRows; + nOutputCols = (nInputCols - 1) * scol + nKernelCols; + } else { /* valid */ + nOutputRows = (nInputRows - nKernelRows) / srow + 1; + nOutputCols = (nInputCols - nKernelCols) / scol + 1; + } + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols); + + 
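/*
  conv2Dmm is the batched form of conv2Dmv: the same
  (nOutputPlane x nInputPlane) bank of 2-D kernels is applied independently
  to every batch element p, and the flat offsets used below are
      input : p*nInputPlane*nInputRows*nInputCols   + i*nInputRows*nInputCols
      output: p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputRows*nOutputCols
  so the result has shape (nbatch, nOutputPlane, nOutputRows, nOutputCols).
*/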
input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + /*THTensor_(zero)(r_);*/ +#pragma omp parallel for private(p) + for (p=0; p < r_->size[0]; p++) + { + int64_t k; + for (k = 0; k < r_->size[1]; k++) + { + real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] = 0.0; + } + } + } + else if (beta != 1) + { + /*THTensor_(mul)(r_, beta);*/ +#pragma omp parallel for private(p) + for(p=0; p < r_->size[0]; p++) + { + int64_t k; + for (k = 0; k < r_->size[1]; k++) + { + real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; + int64_t l; + for (l = 0; l < nOutputRows*nOutputCols; l++) + ptr_output[l] *= beta; + } + } + } + +#pragma omp parallel for private(p) + for(p=0; p < nbatch; p++) + { + int64_t k; + for(k = 0; k < nOutputPlane; k++) + { + int64_t i; + /* get output */ + real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows; + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols; + + /* do image, kernel convolution */ + if (*vf == 'F') + if (*xc == 'X') + THTensor_(fullXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(fullConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + if (*xc == 'X') + THTensor_(validXCorr2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + else + THTensor_(validConv2Dptr)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol); + } + /* Next output plane */ + /* output_data += nOutputCols*nOutputRows;*/ + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 2D input, 2D kernel, 2D output + scalar multiplication like + y <- x*y + beta*y +*/ +void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + THTensor *input; + THTensor* kernel; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputRows, nOutputCols; + real *ptr_input; + real *ptr_weight; + real *output_data; + ptrdiff_t nelem; + + AT_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputRows = input->size[0]; + nInputCols = input->size[1]; + nKernelRows = kernel->size[0]; + nKernelCols = kernel->size[1]; + + THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = 
THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize2d)(r_, nOutputRows, nOutputCols); + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + THTensor_(zero)(r_); + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + ptr_input = THTensor_(data)(input); + ptr_weight = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + + /* do image, kernel convolution */ + THTensor_(conv2d)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + component wise multiplication like + y <- y.*x + beta*y +*/ +void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + k*istride0; + + /* do image, kernel convolution */ + THTensor_(conv2d)(output_data, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + /* Next output plane */ + output_data += nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + component wise multiplication like with a permutation map + y <- y.*x + beta*y +*/ +void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, 
nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor* kernel; + real *input_data; + real *weight_data; + real *output_data; + int64_t nmaps; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(map->_dim() == 2 , 4, "map: 2D Tensor expected"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) + || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel"); + + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + nmaps = map->size[0]; + + for(k = 0; k < nmaps; k++) + { + /* get indices */ + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; + + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + from*istride0; + /* get output */ + real *ptr_output = output_data + to*nOutputRows*nOutputCols; + + /* do image, kernel convolution */ + THTensor_(conv2d)(ptr_output, + alpha, + ptr_input, nInputRows, nInputCols, + ptr_weight, nKernelRows, nKernelCols, + srow, scol, vf, xc); + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 5D output + like rank1 update + A <- xx' + beta*A + for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for + calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 +*/ +void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + + input = THTensor_(newContiguous)(t_); + 
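/*
  The map tensor used by conv2Dmap above is a 2-column list of 1-based
  (from, to) plane indices: row k means "correlate input plane `from` with
  kernel k and accumulate into output plane `to`" (the -1 in the loop
  converts to 0-based offsets). A hypothetical 3-row map wiring planes
  1->1, 2->1 and 3->2, built with the public setters:
*/
static THTensor *sketch_conv2Dmap_map(void)
{
  THTensor *map = THTensor_(newWithSize2d)(3, 2);
  THTensor_(set2d)(map, 0, 0, 1); THTensor_(set2d)(map, 0, 1, 1);
  THTensor_(set2d)(map, 1, 0, 2); THTensor_(set2d)(map, 1, 1, 1);
  THTensor_(set2d)(map, 2, 0, 3); THTensor_(set2d)(map, 2, 1, 2);
  return map;
}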
kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelDepth= kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel"); + + nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth; + nOutputRows = nInputRows - (nKernelRows - 1) * srow; + nOutputCols = nInputCols - (nKernelCols - 1) * scol; + + nelem = THTensor_(nElement)(r_); + THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nKernelPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(validXCorr3DRevptr)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol); + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + + +/* + 4D input, 4D kernel, 5D output + like rank1 update + A <- xx' + beta*A +*/ +void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nKernelPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck((nInputDepth >= nKernelDepth + && nInputRows >= nKernelRows + && nInputCols >= nKernelCols) + || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = 
THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nKernelPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data+k*kstride0; + + for(i = 0; i < nInputPlane; i++) + { + /* get input */ + real *ptr_input = input_data+i*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 5D kernel, 4D output + matrix vector product like + y <- Ax + beta*y +*/ +void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k, i; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) { + kernel = THTensor_(newContiguous)(k_); + } else { + THTensor_(retain)(k_); + kernel = k_; + } + + nInputPlane = input->size[0]; + istride0 = input->stride[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + kstride1 = kernel->stride[1]; + nKernelDepth = kernel->size[2]; + nKernelRows = kernel->size[3]; + nKernelCols = kernel->size[4]; + nOutputPlane = kernel->size[0]; + THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + + THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + 
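/*
  Shape sketch for conv3Dger with hypothetical sizes: input 2 x 8 x 16 x 16
  (planes x depth x rows x cols), kernel 4 x 3 x 5 x 5, unit strides,
  vf = "V", xc = "X". convsize gives nOutputDepth = 6 and
  nOutputRows = nOutputCols = 12, so the result is resized to the 5-D shape
  (nKernelPlane, nInputPlane, 6, 12, 12) = 4 x 2 x 6 x 12 x 12.
*/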
THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + for(i = 0; i < nInputPlane; i++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0 + i*kstride1; + /* get input */ + real *ptr_input = input_data + i*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + } + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 3D input, 3D kernel, 3D output + scalar multiplication like + y <- x*y + beta*y +*/ +void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + THTensor *input; + THTensor* kernel; + int64_t nInputDepth; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelDepth; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; + real *ptr_input; + real *ptr_weight; + real *output_data; + ptrdiff_t nelem; + + AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + nInputDepth = input->size[0]; + nInputRows = input->size[1]; + nInputCols = input->size[2]; + nKernelDepth = kernel->size[0]; + nKernelRows = kernel->size[1]; + nKernelCols = kernel->size[2]; + + THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols); + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + THTensor_(zero)(r_); + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + ptr_input = THTensor_(data)(input); + ptr_weight = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 4D output + component wise multiplication like + y <- y.*x + beta*y +*/ +void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, 
nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + real *input_data; + real *weight_data; + real *output_data; + ptrdiff_t nelem; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + for(k = 0; k < nOutputPlane; k++) + { + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + k*istride0; + + /* do image, kernel convolution */ + THTensor_(conv3d)(output_data, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + + /* Next output plane */ + output_data += nOutputDepth*nOutputCols*nOutputRows; + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} + +/* + 4D input, 4D kernel, 4D output + component wise multiplication like with a permutation map + y <- y.*x + beta*y +*/ +void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) +{ + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; + + THTensor *input; + THTensor *kernel; + ptrdiff_t nelem; + real *input_data; + real *weight_data; + real *output_data; + int64_t nmaps; + int64_t k; + + AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + 
THArgCheck(map->_dim() == 2 , 4, "map: 2D Tensor expected"); + THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); + THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); + THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); + THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); + + input = THTensor_(newContiguous)(t_); + kernel = THTensor_(newContiguous)(k_); + + istride0 = input->stride[0]; + nInputPlane = input->size[0]; + nInputDepth = input->size[1]; + nInputRows = input->size[2]; + nInputCols = input->size[3]; + + kstride0 = kernel->stride[0]; + nOutputPlane = kernel->size[0]; + nKernelDepth = kernel->size[1]; + nKernelRows = kernel->size[2]; + nKernelCols = kernel->size[3]; + + THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); + THArgCheck((nInputDepth >= nKernelDepth + && nInputRows >= nKernelRows + && nInputCols >= nKernelCols) || *vf == 'F', + 2, "conv3Dmap : Input image is smaller than kernel"); + + nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); + nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); + nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); + + nelem = THTensor_(nElement)(r_); + THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); + + if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) + { + THTensor_(zero)(r_); + } + else if (beta != 1) + THTensor_(mul)(r_, r_, beta); + + input_data = THTensor_(data)(input); + weight_data = THTensor_(data)(kernel); + output_data = THTensor_(data)(r_); + + nmaps = map->size[0]; + + for(k = 0; k < nmaps; k++) + { + /* get indices */ + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; + + /* get kernel */ + real *ptr_weight = weight_data + k*kstride0; + /* get input */ + real *ptr_input = input_data + from*istride0; + /* get output */ + real *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols; + + /* do image, kernel convolution */ + THTensor_(conv3d)(ptr_output, + alpha, + ptr_input, nInputDepth, nInputRows, nInputCols, + ptr_weight, nKernelDepth, nKernelRows, nKernelCols, + sdepth, srow, scol, vf, xc); + } + THTensor_(free)(input); + THTensor_(free)(kernel); +} +#endif diff --git a/aten/src/TH/generic/THTensorConv.h b/aten/src/TH/generic/THTensorConv.h new file mode 100644 index 0000000..279ece6 --- /dev/null +++ b/aten/src/TH/generic/THTensorConv.h @@ -0,0 +1,79 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorConv.h" +#else + +TH_API void THTensor_(validXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(validConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(fullXCorr2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(fullConv2Dptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(validXCorr2DRevptr)(real *r_, + real alpha, + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); + +TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor 
*k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); + +TH_API void THTensor_(validXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(validConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(fullXCorr3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(fullConv3Dptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(validXCorr3DRevptr)(real *r_, + real alpha, + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); + +TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol); +TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); + +#endif diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp new file mode 100644 index 0000000..939e5b8 --- /dev/null +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -0,0 +1,249 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorCopy.cpp" +#else + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#ifdef _OPENMP +#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 +#include +#endif + +int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { + const int MIN_SZ = 60 * 60; + return THTensor_(isContiguous)(tensor) && + !src->is_empty() && + THTensor_(nDimension)(src) == 2 && + THTensor_(stride)(src, 0) == 1 && + 
THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) && + THTensor_(nElement)(tensor) >= MIN_SZ; +} + +// special case copy where tensor is contiguous and src is a transposed matrix +// This can be generalized to most copies, but it's tricker +void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { + #define MIN(x, y) (((x) < (y)) ? (x) : (y)) + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +#ifdef TH_REAL_IS_BYTE + const int BLOCK_SZ = 120; +#else + const int BLOCK_SZ = 60; +#endif + + THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(tensor); + real *bp = THTensor_(data)(buf); + + + int64_t NR = THTensor_(size)(src, 0); + int64_t NC = THTensor_(size)(src, 1); + for (int64_t R = 0; R < NR; R += BLOCK_SZ) { + for (int64_t C = 0; C < NC; C += BLOCK_SZ) { + real *spo = sp + R + C * NR; + real *rpo = rp + C + R * NC; + + int nr = MIN(NR - R, BLOCK_SZ); + int nc = MIN(NC - C, BLOCK_SZ); + + // 1. copy columns from src to buf + for (int c = 0; c < nc; c++) { + memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real)); + } + + // 2. transpose buf in place + int rc_max = MAX(nr, nc); + int rc_min = MIN(nr, nc); + for (int r = 0; r < rc_max; r++) { + int end = MIN(r, rc_min); + for (int c = 0; c < end; c++) { + real tmp = bp[r + BLOCK_SZ * c]; + bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; + bp[r * BLOCK_SZ + c] = tmp; + } + } + + // 3. copy rows from buf to dst + for (int r = 0; r < nr; r++) { + memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real)); + } + } + } + THTensor_(free)(buf); + #undef MIN + #undef MAX +} + +void THTensor_(copy)(THTensor *tensor, THTensor *src) +{ + if (tensor == src) return; + ptrdiff_t tensorSize = THTensor_(nElement)(tensor); + ptrdiff_t srcSize = THTensor_(nElement)(src); + int tensorContig = THTensor_(isContiguous)(tensor); + int srcContig = THTensor_(isContiguous)(src); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); +#endif + if (tensorSize == srcSize) { + if ( tensorContig && srcContig) { + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(tensor); +#ifndef TH_REAL_IS_HALF +#ifdef _OPENMP + #pragma omp parallel if ( (tensorSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP) ) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + ptrdiff_t offset = tid * (tensorSize / num_threads); + ptrdiff_t end = (tid == num_threads - 1) ? tensorSize : offset + tensorSize / num_threads; + ptrdiff_t len = end - offset; + real *tensorData = rp + offset; + real *srcData = sp + offset; + THVector_(copy)(tensorData, srcData, len); + } +#else + THVector_(copy)(rp, sp, srcSize); +#endif + +#else + +#ifdef _OPENMP + if ((srcSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP)) { + ptrdiff_t i; + #pragma omp parallel for private (i) + for(i=0; i (int64_t)-1 -> (uint8_t)255 is +// guaranteed to look like this, but we have (double)-1 -> (uint8_t) +// because it's UB. This also makes UBSan really angry. +// +// I think those rules are stupid and we really shouldn't conform to them. +// The structs below ensure that for all unsigned types we use (currently +// only uint8_t), we will do an intermediate convertion via int64_t, +// to ensure that any negative values are wrapped around correctly. 
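/*
  Concrete illustration of the wrap-around described above (plain C++,
  independent of the TH macros): hopping through int64_t makes the negative
  value wrap modulo 2^8 instead of being undefined.
*/
static_assert(static_cast<uint8_t>(static_cast<int64_t>(-1.0)) == 255,
              "a negative double wraps to 255 after the int64_t hop");
/* by contrast, static_cast<uint8_t>(-1.0) directly is undefined behaviour,
   because -1 is not representable in uint8_t */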
+// +// Note that conversions from doubles to signed integral types that can't +// represent a particular value after truncating the fracitonal part are UB as well, +// but fixing them is not as simple as adding an int64_t intermediate, beacuse the +// int64_t -> conversion is UB for those large values anyway. +// I guess in that case we just have to live with that, but it's definitely less +// surprising than the thing above. +// +// For the curious: +// https://en.cppreference.com/w/cpp/language/implicit_conversion +// The relevant paragraph is "Floating–integral conversions". +template +struct inter_copy_type { + using type = T; +}; + +template<> +struct inter_copy_type { + using type = int64_t; +}; + +template +using inter_copy_type_t = typename inter_copy_type::type; + +#endif + +#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, \ + *tensor_data = static_cast( \ + static_cast>(*src_data));) \ +} + +#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ +} + +#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, \ + *tensor_data = static_cast( \ + static_cast>( \ + TH_half2float(*src_data)));) \ +} + +#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ +} + +#ifndef TH_REAL_IS_HALF +IMPLEMENT_THTensor_COPY(Byte, uint8_t) +IMPLEMENT_THTensor_COPY(Char, int8_t) +IMPLEMENT_THTensor_COPY(Short, int16_t) +IMPLEMENT_THTensor_COPY(Int, int32_t) +IMPLEMENT_THTensor_COPY(Long, int64_t) +IMPLEMENT_THTensor_COPY(Float, float) +IMPLEMENT_THTensor_COPY(Double, double) +IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) +IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) +IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) + +#endif /* REAL_IS_HALF */ + +#endif diff --git a/aten/src/TH/generic/THTensorCopy.h b/aten/src/TH/generic/THTensorCopy.h new file mode 100644 index 0000000..b9e5bfc --- /dev/null +++ b/aten/src/TH/generic/THTensorCopy.h @@ -0,0 +1,17 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorCopy.h" +#else + +/* Support for copy between different Tensor types */ + +TH_API void THTensor_(copy)(THTensor *tensor, THTensor *src); +TH_API void THTensor_(copyByte)(THTensor *tensor, struct THByteTensor *src); +TH_API void THTensor_(copyChar)(THTensor *tensor, struct THCharTensor *src); +TH_API void THTensor_(copyShort)(THTensor *tensor, struct THShortTensor *src); +TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src); +TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src); +TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor 
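/*
  For the Byte specialization (real == uint8_t),
  IMPLEMENT_THTensor_COPY(Double, double) therefore expands, per element, to
      *tensor_data = static_cast<uint8_t>(static_cast<int64_t>(*src_data));
  which is exactly the int64_t hop motivated above; TH_TENSOR_APPLY2 supplies
  the element-wise traversal of both tensors.
*/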
*src); +TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src); +TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src); + +#endif diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp new file mode 100644 index 0000000..de65f08 --- /dev/null +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -0,0 +1,45 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorFastGetSet.hpp" +#else + +static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]]; +} + +static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]]; +} + +static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]]; +} + +static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]]; +} + +static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]]; +} + +static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]] = value; +} + +static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]] = value; +} + +static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]] = value; +} + +static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]] = value; +} + +static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]] = value; +} + +#endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp new file mode 100644 index 0000000..9bc5b19 --- /dev/null +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -0,0 +1,1139 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorLapack.cpp" +#else + +/* +Check if self is transpose of a contiguous matrix +*/ +static int THTensor_(isTransposedContiguous)(THTensor *self) +{ + return self->stride[0] == 1 && self->stride[1] == self->size[0]; +} +/* +If a matrix is a regular contiguous matrix, make sure it is transposed +because this is what we return from Lapack calls. 
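/*
  Worked example for the fastGet/fastSet offset arithmetic: a contiguous
  3 x 4 matrix has strides {4, 1}, so fastGet2d(self, 2, 3) reads
      data[storageOffset + 2*4 + 3*1] = data[storageOffset + 11],
  i.e. the last element. Its transposed view (4 x 3) has strides {1, 4},
  which is precisely the pattern isTransposedContiguous() tests for
  (stride[0] == 1 and stride[1] == size[0]).
*/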
+*/ +static void THTensor_(checkTransposed)(THTensor *self) +{ + if(THTensor_(isContiguous)(self)) + THTensor_(transpose)(self, NULL, 0, 1); + return; +} +/* +newContiguous followed by transpose +Similar to (newContiguous), but checks if the transpose of the matrix +is contiguous and also limited to 2D matrices. +*/ +static THTensor *THTensor_(newTransposedContiguous)(THTensor *self) +{ + THTensor *tensor; + if(THTensor_(isTransposedContiguous)(self)) + { + THTensor_(retain)(self); + tensor = self; + } + else + { + tensor = THTensor_(newContiguous)(self); + THTensor_(transpose)(tensor, NULL, 0, 1); + } + + return tensor; +} + +/* +Given the result tensor and src tensor, decide if the lapack call should use the +provided result tensor or should allocate a new space to put the result in. + +The returned tensor have to be freed by the calling function. + +nrows is required, because some lapack calls, require output space smaller than +input space, like underdetermined gels. +*/ +static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows) +{ + /* check if user wants to reuse src and if it is correct shape/size */ + if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows) + THTensor_(retain)(result); + else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */ + result = THTensor_(new)(); + else + THTensor_(retain)(result); + return result; +} + +/* +Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require +the resulting tensor to be larger than src. +*/ +static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows) +{ + THTensor *result; + THTensor *view; + + if (src == NULL) + src = self; + result = THTensor_(checkLapackClone)(self, src, nrows); + if (src == result) + return result; + + THTensor_(resize2d)(result, src->size[1], nrows); + THTensor_(checkTransposed)(result); + + if (src->size[0] == nrows) + THTensor_(copy)(result, src); + else + { + view = THTensor_(newNarrow)(result, 0, 0, src->size[0]); + THTensor_(copy)(view, src); + THTensor_(free)(view); + } + return result; +} + +/* +Create a clone of src in self column major order for use with Lapack. +If src == self, a new tensor is allocated, in any case, the return tensor should be +freed by calling function. 
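+
+A typical call site (see e.g. gesv below) looks roughly like:
+
+  THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);   // column-major working copy
+  ...                                                     // LAPACK call on THTensor_(data)(ra__)
+  THTensor_(freeCopyTo)(ra__, ra_);                       // copy result back into ra_ and free ra__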
+*/ +static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src) +{ + return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]); +} + +void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) +{ + int free_b = 0; + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 2, "B should not be empty"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THIntTensor *ipiv; + THTensor *ra__; // working version of A matrix to be passed into lapack GELS + THTensor *rb__; // working version of B matrix to be passed into lapack GELS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + ipiv = THIntTensor_newWithSize1d((int64_t)n); + THLapack_(gesv)(n, nrhs, + THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), + THTensor_(data)(rb__), ldb, &info); + + THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + THIntTensor_free(ipiv); + if (free_b) THTensor_(free)(b);), + "gesv", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + THIntTensor_free(ipiv); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, + const char *uplo, const char *trans, const char *diag) +{ + int free_b = 0; + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->_dim()); + THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->_dim()); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS + THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + THLapack_(trtrs)(uplo[0], trans[0], diag[0], n, nrhs, + THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, &info); + + + THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + if (free_b) THTensor_(free)(b);), + "trtrs", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(gels)(THTensor *rb_, 
THTensor *ra_, THTensor *b, THTensor *a) +{ + int free_b = 0; + // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_. + if (a == NULL) a = ra_; + if (b == NULL) b = rb_; + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 1, "B should not be empty"); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int m, n, nrhs, lda, ldb, info, lwork; + THTensor *work = NULL; + real wkopt = 0; + + THTensor *ra__ = NULL; // working version of A matrix to be passed into lapack GELS + THTensor *rb__ = NULL; // working version of B matrix to be passed into lapack GELS + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + lda = m; + ldb = (m > n) ? m : n; + + rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb); + + nrhs = rb__->size[1]; + info = 0; + + + /* get optimal workspace size */ + THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, + &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, + THTensor_(data)(rb__), ldb, + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero", + THCleanup(THTensor_(free)(ra__); + THTensor_(free)(rb__); + THTensor_(free)(work); + if (free_b) THTensor_(free)(b);), + "gels", info,""); + + /* + * In the m < n case, if the input b is used as the result (so b == _rb), + * then rb_ was originally m by nrhs but now should be n by nrhs. + * This is larger than before, so we need to expose the new rows by resizing. 
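+   * For example, with a 2 x 3 A and a 2 x 1 b (so m = 2 < n = 3), the
+   * minimum-norm solution has n = 3 rows, and rb_ is resized from 2 x 1
+   * to 3 x 1 here.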
+ */ + if (m < n && b == rb_) { + THTensor_(resize2d)(rb_, n, nrhs); + } + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(freeCopyTo)(rb__, rb_); + THTensor_(free)(work); + if (free_b) THTensor_(free)(b); +} + +void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr) +{ + int n, lda, lwork, info, ldvr; + THTensor *work=nullptr, *wi, *wr, *a; + real wkopt; + real *rv_data; + int64_t i; + + THTensor *re__ = NULL; + THTensor *rv__ = NULL; + + THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square"); + + /* we want to definitely clone a_ for geev*/ + a = THTensor_(cloneColumnMajor)(NULL, a_); + + n = a->size[0]; + lda = n; + + wi = THTensor_(newWithSize1d)(n); + wr = THTensor_(newWithSize1d)(n); + + rv_data = NULL; + ldvr = 1; + if (*jobvr == 'V') + { + THTensor_(resize2d)(rv_,n,n); + /* guard against someone passing a correct size, but wrong stride */ + rv__ = THTensor_(newTransposedContiguous)(rv_); + rv_data = THTensor_(data)(rv__); + ldvr = n; + } + THTensor_(resize2d)(re_,n,2); + re__ = THTensor_(newContiguous)(re_); + + if (n > 0) { // lapack doesn't work with size 0 + /* get optimal workspace size */ + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, &wkopt, -1, &info); + + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", + THCleanup(THTensor_(free)(re__); + THTensor_(free)(rv__); + THTensor_(free)(a); + THTensor_(free)(wi); + THTensor_(free)(wr); + THTensor_(free)(work);), + "geev", info,""); + } + + { + real *re_data = THTensor_(data)(re__); + real *wi_data = THTensor_(data)(wi); + real *wr_data = THTensor_(data)(wr); + for (i=0; idim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1,"A should be square"); + + int n, lda, lwork, info; + THTensor *work = nullptr; + real wkopt; + + THTensor *rv__ = NULL; + THTensor *re__ = NULL; + + rv__ = THTensor_(cloneColumnMajor)(rv_, a); + + n = rv__->size[0]; + lda = n; + + THTensor_(resize1d)(re_,n); + re__ = THTensor_(newContiguous)(re_); + + /* get optimal workspace size */ + if (n != 0) { + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", + THCleanup(THTensor_(free)(rv__); + THTensor_(free)(re__); + THTensor_(free)(work);), + "syev", info,""); + } + + // No eigenvectors specified + if (*jobz == 'N') { + THTensor_(fill)(rv_, 0); + } + + THTensor_(freeCopyTo)(rv__, rv_); + THTensor_(freeCopyTo)(re__, re_); + THTensor_(free)(work); +} + +void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu) +{ + THTensor *ra_ = THTensor_(new)(); + THTensor_(gesvd2)(ru_, rs_, rv_, ra_, a, jobu); + THTensor_(free)(ra_); +} + +void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) +{ + if (a == NULL) a = ra_; + THArgCheck(a->dim() 
== 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); + + int k,m, n, lda, ldu, ldvt, lwork, info; + THTensor *work; + THTensor *rvf_ = THTensor_(new)(); + real wkopt; + + THTensor *ra__ = NULL; + THTensor *ru__ = NULL; + THTensor *rs__ = NULL; + THTensor *rv__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + k = (m < n ? m : n); + + lda = m; + ldu = m; + ldvt = n; + + THTensor_(resize1d)(rs_,k); + THTensor_(resize2d)(rvf_,ldvt,n); + if (*jobu == 'A') + THTensor_(resize2d)(ru_,m,ldu); + else + THTensor_(resize2d)(ru_,k,ldu); + + THTensor_(checkTransposed)(ru_); + + /* guard against someone passing a correct size, but wrong stride */ + ru__ = THTensor_(newTransposedContiguous)(ru_); + rs__ = THTensor_(newContiguous)(rs_); + rv__ = THTensor_(newContiguous)(rvf_); + + THLapack_(gesvd)(jobu[0],jobu[0], + m,n,THTensor_(data)(ra__),lda, + THTensor_(data)(rs__), + THTensor_(data)(ru__), + ldu, + THTensor_(data)(rv__), ldvt, + &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(gesvd)(jobu[0],jobu[0], + m,n,THTensor_(data)(ra__),lda, + THTensor_(data)(rs__), + THTensor_(data)(ru__), + ldu, + THTensor_(data)(rv__), ldvt, + THTensor_(data)(work),lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.", + THCleanup( + THTensor_(free)(ru__); + THTensor_(free)(rs__); + THTensor_(free)(rv__); + THTensor_(free)(ra__); + THTensor_(free)(work);), + "gesvd", info, ""); + + if (*jobu == 'S') + THTensor_(narrow)(rv__,NULL,1,0,k); + + THTensor_(freeCopyTo)(ru__, ru_); + THTensor_(freeCopyTo)(rs__, rs_); + THTensor_(freeCopyTo)(rv__, rvf_); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); + + if (*jobu == 'S') { + THTensor_(narrow)(rvf_,NULL,1,0,k); + } + THTensor_(resizeAs)(rv_, rvf_); + THTensor_(copy)(rv_, rvf_); + THTensor_(free)(rvf_); +} + +void THTensor_(getri)(THTensor *ra_, THTensor *a) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int m, n, lda, info, lwork; + real wkopt; + THIntTensor *ipiv; + THTensor *work; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + m = ra__->size[0]; + n = ra__->size[1]; + lda = m; + ipiv = THIntTensor_newWithSize1d((int64_t)m); + + /* Run LU */ + THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info); + THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", + THCleanup( + THTensor_(free)(ra__); + THIntTensor_free(ipiv);), + "getrf", info, info); + + /* Run inverse */ + THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info); + THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work); + THIntTensor_free(ipiv);), + "getri", info, info); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); + THIntTensor_free(ipiv); +} + +void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) +{ + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + /* Build full matrix */ + real *p = THTensor_(data)(a); + 
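+  /* uplo selects which triangle holds the data of interest: for 'U' the
+     strictly lower part of the n x n matrix is zeroed, for 'L' the strictly
+     upper part; the diagonal itself is always kept. */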
int64_t i, j; + + /* Upper Triangular Case */ + if (uplo[0] == 'U') + { + /* Clear lower triangle (excluding diagonals) */ + for (i=0; i_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + /* Build full matrix */ + real *p = THTensor_(data)(a); + int64_t i, j; + + /* Upper Triangular Case */ + if (uplo[0] == 'U') + { + /* Clear lower triangle (excluding diagonals) */ + for (i=0; i_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n, lda, info; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + n = ra__->size[0]; + lda = n; + + /* Run Factorization */ + THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info); + THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite", + THCleanup(THTensor_(free)(ra__);), + "potrf", info, ""); + + THTensor_(clearUpLoTriangle)(ra__, uplo); + THTensor_(freeCopyTo)(ra__, ra_); +} + +void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) +{ + int free_b = 0; + if (b == NULL) b = rb_; + + THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->_dim()); + THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->_dim()); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", + a->size[0], a->size[1]); + THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size[0], b->size[0]); + + if (b->_dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], + b->stride[0], 1, 0); + free_b = 1; + } + + int n, nrhs, lda, ldb, info; + THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS + THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS + + ra__ = THTensor_(cloneColumnMajor)(NULL, a); + rb__ = THTensor_(cloneColumnMajor)(rb_, b); + + n = (int)ra__->size[0]; + nrhs = (int)rb__->size[1]; + lda = n; + ldb = n; + + THLapack_(potrs)(uplo[0], n, nrhs, THTensor_(data)(ra__), + lda, THTensor_(data)(rb__), ldb, &info); + + + THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(rb__); + if (free_b) THTensor_(free)(b);), + "potrs", info, info); + + if (free_b) THTensor_(free)(b); + THTensor_(free)(ra__); + THTensor_(freeCopyTo)(rb__, rb_); +} + +void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n, lda, info; + THTensor *ra__ = NULL; + + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + n = ra__->size[0]; + lda = n; + + /* Run inverse */ + THLapack_(potri)(uplo[0], n, THTensor_(data)(ra__), lda, &info); + THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized", + THCleanup(THTensor_(free)(ra__);), + "potri", info, info); + + THTensor_(copyUpLoTriangle)(ra__, uplo); + THTensor_(freeCopyTo)(ra__, ra_); +} + +/* + Computes the Cholesky factorization with complete pivoting of a real symmetric + positive semidefinite matrix. + + Args: + * `ra_` - result Tensor in which to store the factor U or L from the + Cholesky factorization. + * `rpiv_` - result IntTensor containing sparse permutation matrix P, encoded + as P[rpiv_[k], k] = 1. 
+ * `a` - input Tensor; the input matrix to factorize. + * `uplo` - string; specifies whether the upper or lower triangular part of + the symmetric matrix A is stored. "U"/"L" for upper/lower + triangular. + * `tol` - double; user defined tolerance, or < 0 for automatic choice. + The algorithm terminates when the pivot <= tol. + */ +void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) { + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + + int n = a->size[0]; + + THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a); + THIntTensor_resize1d(rpiv_, n); + + // Allocate working tensor + THTensor *work = THTensor_(newWithSize1d)(2 * n); + + // Run Cholesky factorization + int lda = n; + int rank, info; + + THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda, + THIntTensor_data(rpiv_), &rank, tol, + THTensor_(data)(work), &info); + + THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "pstrf", info,""); + + THTensor_(clearUpLoTriangle)(ra__, uplo); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + Perform a QR decomposition of a matrix. + + In LAPACK, two parts of the QR decomposition are implemented as two separate + functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped + directly, below - but to make the common usage convenient, we also provide + this function, which calls them both and returns the results in a more + intuitive form. + + Args: + * `rq_` - result Tensor in which to store the Q part of the decomposition. + * `rr_` - result Tensor in which to store the R part of the decomposition. + * `a` - input Tensor; the matrix to decompose. + +*/ +void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) +{ + int m = a->size[0]; + int n = a->size[1]; + int k = (m < n ? m : n); + THTensor *ra_ = THTensor_(new)(); + THTensor *rtau_ = THTensor_(new)(); + THTensor *rr__ = THTensor_(new)(); + THTensor_(geqrf)(ra_, rtau_, a); + THTensor_(resize2d)(rr__, k, ra_->size[1]); + THTensor_(narrow)(rr__, ra_, 0, 0, k); + THTensor_(triu)(rr_, rr__, 0); + THTensor_(resize2d)(rq_, ra_->size[0], k); + THTensor_(orgqr)(rq_, ra_, rtau_); + THTensor_(narrow)(rq_, rq_, 1, 0, k); + THTensor_(free)(ra_); + THTensor_(free)(rtau_); + THTensor_(free)(rr__); +} + +/* + The geqrf function does the main work of QR-decomposing a matrix. + However, rather than producing a Q matrix directly, it produces a sequence of + elementary reflectors which may later be composed to construct Q - for example + with the orgqr function, below. + + Args: + * `ra_` - Result matrix which will contain: + i) The elements of R, on and above the diagonal. + ii) Directions of the reflectors implicitly defining Q. + * `rtau_` - Result tensor which will contain the magnitudes of the reflectors + implicitly defining Q. + * `a` - Input matrix, to decompose. If NULL, `ra_` is used as input. + + For further details, please see the LAPACK documentation. + +*/ +void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a) +{ + if (a == NULL) ra_ = a; + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); + + THTensor *ra__ = NULL; + + /* Prepare the input for LAPACK, making a copy if necessary. 
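+     (LAPACK's geqrf overwrites its input in place and expects column-major
+     storage, hence the column-major working clone below.)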
*/ + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + int m = ra__->size[0]; + int n = ra__->size[1]; + int k = (m < n ? m : n); + int lda = m; + THTensor_(resize1d)(rtau_, k); + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, + THTensor_(data)(rtau_), + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, + THTensor_(data)(rtau_), + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "geqrf", info,""); + + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + The orgqr function allows reconstruction of a matrix Q with orthogonal + columns, from a sequence of elementary reflectors, such as is produced by the + geqrf function. + + Args: + * `ra_` - result Tensor, which will contain the matrix Q. + * `a` - input Tensor, which should be a matrix with the directions of the + elementary reflectors below the diagonal. If NULL, `ra_` is used as + input. + * `tau` - input Tensor, containing the magnitudes of the elementary + reflectors. + + For further details, please see the LAPACK documentation. + +*/ +void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + + THTensor *ra__ = NULL; + ra__ = THTensor_(cloneColumnMajor)(ra_, a); + + int m = ra__->size[0]; + int k = tau->size[0]; + int lda = m; + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, + THTensor_(data)(tau), + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, + THTensor_(data)(tau), + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "orgqr", info,""); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +/* + The ormqr function multiplies Q with another matrix from a sequence of + elementary reflectors, such as is produced by the geqrf function. + + Args: + * `ra_` - result Tensor, which will contain the matrix Q' c. + * `a` - input Tensor, which should be a matrix with the directions of the + elementary reflectors below the diagonal. If NULL, `ra_` is used as + input. + * `tau` - input Tensor, containing the magnitudes of the elementary + reflectors. + * `c` - input Tensor, containing the matrix to be multiplied. + * `side` - char, determining whether c is left- or right-multiplied with Q. + * `trans` - char, determining whether to transpose Q before multiplying. + + For further details, please see the LAPACK documentation. 
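+
+  For example, after THTensor_(geqrf)(ra_, rtau_, a) has factored A, the
+  product Q^T * c can be formed without materializing Q, roughly as:
+
+    THTensor_(ormqr)(result, ra_, rtau_, c, "L", "T");
+
+  where `result` is an output tensor allocated by the caller.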
+ +*/ +void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans) +{ + if (a == NULL) a = ra_; + THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + + THTensor *ra__ = NULL; + ra__ = THTensor_(cloneColumnMajor)(ra_, c); + + int m = c->size[0]; + int n = c->size[1]; + int k = tau->size[0]; + int lda; + if (*side == 'L') + { + lda = m; + } + else + { + lda = n; + } + int ldc = m; + + /* Dry-run to query the suggested size of the workspace. */ + int info = 0; + real wkopt = 0; + THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, + THTensor_(data)(tau), THTensor_(data)(ra__), ldc, + &wkopt, -1, &info); + + /* Allocate the workspace and call LAPACK to do the real work. */ + int lwork = (int)wkopt; + THTensor *work = THTensor_(newWithSize1d)(lwork); + THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, + THTensor_(data)(tau), THTensor_(data)(ra__), ldc, + THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i", + THCleanup( + THTensor_(free)(ra__); + THTensor_(free)(work);), + "ormqr", info,""); + THTensor_(freeCopyTo)(ra__, ra_); + THTensor_(free)(work); +} + +void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a) +{ + AT_CHECK(THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); + if (!pivot) { + THError("btrifact without pivoting is not implemented on the CPU"); + } + + if (ra_ != a) { + THTensor_(resizeAs)(ra_, a); + THTensor_(copy)(ra_, a); + } + + int m = a->size[1]; + int n = a->size[2]; + if (m != n) { + THError("btrifact is only implemented for square matrices"); + } + int64_t num_batches = THTensor_(size)(a, 0); + THTensor *ra__; + int lda; + + if (ra_->stride[1] == 1) { + // column ordered, what BLAS wants + lda = ra_->stride[2]; + ra__ = ra_; + } else { + // not column ordered, need to make it such (requires copy) + THTensor *transp_r_ = THTensor_(newTranspose)(ra_, 1, 2); + ra__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(ra__, NULL, 1, 2); + lda = ra__->stride[2]; + } + + THTensor *ai = THTensor_(new)(); + THTensor *rai = THTensor_(new)(); + THIntTensor *rpivoti = THIntTensor_new(); + + int info = 0; + int *info_ptr = &info; + if (rinfo_) { + THIntTensor_resize1d(rinfo_, num_batches); + info_ptr = THIntTensor_data(rinfo_); + } + + THIntTensor_resize2d(rpivots_, num_batches, n); + + int64_t batch = 0; + for (; batch < num_batches; ++batch) { + THTensor_(select)(ai, a, 0, batch); + THTensor_(select)(rai, ra__, 0, batch); + THIntTensor_select(rpivoti, rpivots_, 0, batch); + + THLapack_(getrf)(n, n, THTensor_(data)(rai), lda, + THIntTensor_data(rpivoti), info_ptr); + if (rinfo_) { + info_ptr++; + } else if (info != 0) { + break; + } + } + + THTensor_(free)(ai); + THTensor_(free)(rai); + THIntTensor_free(rpivoti); + + if (ra__ != ra_) { + THTensor_(freeCopyTo)(ra__, ra_); + } + + if (!rinfo_ && info != 0) { + THError("failed to factorize batch element %ld (info == %d)", batch, info); + } +} + +void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots) +{ + AT_CHECK(!atf->is_empty() && THTensor_(nDimension)(atf) == 3, "expected non-empty 3D tensor, got size: ", + atf->sizes()); + AT_CHECK(!b->is_empty() && (THTensor_(nDimension)(b) == 3 || + THTensor_(nDimension)(b) == 2), "expected non-empty 2D or 3D tensor, got size: ", b->sizes()); + THArgCheck(THTensor_(size)(atf, 0) == 
+ THTensor_(size)(b, 0), 3, "number of batches must be equal"); + THArgCheck(THTensor_(size)(atf, 1) == + THTensor_(size)(atf, 2), 3, "A matrices must be square"); + THArgCheck(THTensor_(size)(atf, 1) == + THTensor_(size)(b, 1), 3, "dimensions of A and b must be equal"); + + if (rb_ != b) { + THTensor_(resizeAs)(rb_, b); + THTensor_(copy)(rb_, b); + } + + int64_t num_batches = atf->size[0]; + int64_t n = atf->size[1]; + int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + + int lda, ldb; + THTensor *atf_; + THTensor *rb__; + + // correct ordering of A + if (atf->stride[1] == 1) { + // column ordered, what BLAS wants + lda = atf->stride[2]; + atf_ = atf; + } else { + // not column ordered, need to make it such (requires copy) + // it would be nice if we could use the op(A) flags to automatically + // transpose A if needed, but this leads to unpredictable behavior if the + // user clones A_tf later with a different ordering + THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2); + atf_ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(atf_, NULL, 1, 2); + lda = atf_->stride[2]; + } + + // correct ordering of B + if (rb_->stride[1] == 1) { + // column ordered + if (rb_->_dim() == 2 || rb_->size[2] == 1) { + ldb = n; + } else { + ldb = rb_->stride[2]; + } + rb__ = rb_; + } else { + // make column ordered + if (rb_->_dim() > 2) { + THTensor *transp_r_ = THTensor_(newTranspose)(rb_, 1, 2); + rb__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(rb__, NULL, 1, 2); + ldb = rb__->stride[2]; + } else { + rb__ = THTensor_(newClone)(rb_); + ldb = n; + } + } + + THTensor *ai = THTensor_(new)(); + THTensor *rbi = THTensor_(new)(); + THIntTensor *pivoti = THIntTensor_new(); + + if (!THIntTensor_isContiguous(pivots)) { + THError("Error: rpivots_ is not contiguous."); + } + + for (int64_t batch = 0; batch < num_batches; ++batch) { + THTensor_(select)(ai, atf_, 0, batch); + THTensor_(select)(rbi, rb__, 0, batch); + THIntTensor_select(pivoti, pivots, 0, batch); + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + int info; + THLapack_(getrs)('N', n, nrhs, THTensor_(data)(ai), lda, + THIntTensor_data(pivoti), THTensor_(data)(rbi), + ldb, &info); + if (info != 0) { + THError("Error: Nonzero info."); + } +#else + THError("Unimplemented"); +#endif + } + + THTensor_(free)(ai); + THTensor_(free)(rbi); + THIntTensor_free(pivoti); + + if (atf_ != atf) { + THTensor_(free)(atf_); + } + + if (rb__ != rb_) { + THTensor_(freeCopyTo)(rb__, rb_); + } +} + +#endif diff --git a/aten/src/TH/generic/THTensorLapack.h b/aten/src/TH/generic/THTensorLapack.h new file mode 100644 index 0000000..8785943 --- /dev/null +++ b/aten/src/TH/generic/THTensorLapack.h @@ -0,0 +1,25 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorLapack.h" +#else + +TH_API void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); +TH_API void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_, const char *uplo, const char *trans, const char *diag); +TH_API void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); +TH_API void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobz, const char *uplo); +TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr); +TH_API void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobu); +TH_API void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor 
*rv_, THTensor *ra_, THTensor *a, const char *jobu); +TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); +TH_API void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo); +TH_API void THTensor_(potrs)(THTensor *rb_, THTensor *b_, THTensor *a_, const char *uplo); +TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo); +TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a); +TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a); +TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau); +TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans); +TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol); + +TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a); +TH_API void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots); + +#endif diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp new file mode 100644 index 0000000..8559d5d --- /dev/null +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -0,0 +1,4677 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorMath.cpp" +#else + +#ifndef NAN + #define NAN (nan(NULL)) +#endif + +#ifdef _OPENMP +#include +#endif + +#define HYPER_TH_OMP_OVERHEAD_THRESHOLD 2000 +#define ORDIN_TH_OMP_OVERHEAD_THRESHOLD 20000 +#define UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD 50000 +#define TH_OMP_OVERHEAD_THRESHOLD 100000 + +#ifdef _OPENMP + +#ifndef _WIN32 +#define PRAGMA(P) _Pragma(#P) +#else +#define PRAGMA(P) __pragma(P) +#endif + +#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE *TENSOR##_data = THTensor_(data)(TENSOR) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ +{ \ + TYPE *TENSOR##_data = THTensor_(data)(TENSOR); \ + ptrdiff_t TENSOR##_len = THTensor_(nElement)(TENSOR); \ + CODE \ +} +#endif + +#ifdef _OPENMP +#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? 
TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ + ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ + CODE \ +} +#endif + +#ifdef _OPENMP +#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ +{ \ + int inOmp = omp_in_parallel(); \ + ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ + PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ + { \ + size_t num_threads = omp_get_num_threads(); \ + size_t tid = omp_get_thread_num(); \ + ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ + ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ + TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ + ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ + TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3) + TH_TENSOR_offset; \ + CODE \ + } \ +} +#else +#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ + TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ + TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3); \ + ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ + CODE \ +} +#endif + +#define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \ +{ \ + if(!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ + THError("inconsistent tensor size, expected %s %s and %s %s to have the same size", \ + #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + } \ +} + +// Used for `scatter` and `scatterAdd` +// Assumes TENSOR1 is real +// TENSOR2 is src +// TENSOR3 is index +// Tests: +// 1. index->size[d] <= src->size[d] for all d +// 2. 
index->size[d] <= real->size[d] for all d != dim +#define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ +{ \ + int shape_check_flag = 0; \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + { \ + int64_t TENSOR3##_dim_size = TENSOR3->size[TH_TENSOR_DIM_APPLY_i]; \ + if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \ + if (TENSOR3##_dim_size > TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (TENSOR3##_dim_size > TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + shape_check_flag = 1; \ + break; \ + } \ + } \ + if (shape_check_flag == 1) { \ + THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ + THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ + THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ + THError("Expected %s %s to be smaller size than %s %s and to be smaller than %s %s apart from dimension %d", \ + #TENSOR3, T3buff.str, #TENSOR2, T2buff.str, #TENSOR1, T1buff.str, DIMENSION); \ + } \ +} + +static inline real THTensor_(powOne)(real x, real y) { +#if defined(TH_REAL_IS_FLOAT) + return powf(x, y); +#elif defined(TH_REAL_IS_DOUBLE) + return pow(x, y); +#else + THArgCheck(y >= 0, 1, + "Integers to negative integer powers are not allowed"); + real result = 1; + while (y) { + if (y & 1) { + result *= x; + } + y /= 2; + x *= x; + } + return result; +#endif +} + +void THTensor_(fill)(THTensor *r_, real value) +{ + if (THTensor_(isContiguous)(r_) || THTensor_(isTransposed)(r_)) { + TH_TENSOR_APPLY_CONTIG(real, r_, THVector_(fill)(r__data, value, r__len);); + } else { + TH_TENSOR_APPLY(real, r_, + if (r__stride == 1) { + THVector_(fill)(r__data, value, r__size); + r__i = r__size; + r__data += r__stride * r__size; + break; + } else { + *r__data = value; + } + ); + } +} + +void THTensor_(zero)(THTensor *r_) +{ + THTensor_(fill)(r_, 0); +} + +void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value) +{ + TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, + if (*mask_data > 1) + { + THFree(mask_counter); + THFree(tensor_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = value; + }); +} + +void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src ) +{ + THTensor *srct = THTensor_(newContiguous)(src); + real *src_data = THTensor_(data)(srct); + ptrdiff_t cntr = 0; + ptrdiff_t nelem = THTensor_(nElement)(srct); + if (THTensor_(nElement)(tensor) != THByteTensor_nElement(mask)) + { + THTensor_(free)(srct); + THError("Number of elements of destination tensor != Number of elements in mask"); + } + TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, + if (*mask_data > 1) + { + THTensor_(free)(srct); + THFree(mask_counter); + THFree(tensor_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + if (cntr == nelem) + { + THTensor_(free)(srct); + THFree(mask_counter); + THFree(tensor_counter); + THError("Number of elements of src < number of ones in mask"); + } + *tensor_data = *src_data; + src_data++; + cntr++; + }); + THTensor_(free)(srct); +} + +void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) +{ + ptrdiff_t numel = THByteTensor_sumall(mask); + real *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = THTensor_(data)(tensor); + TH_TENSOR_APPLY2(real, src, unsigned char, mask, + if (*mask_data 
> 1) + { + THFree(mask_counter); + THFree(src_counter); + THError("Mask tensor can take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +// Finds non-zero elements of a tensor and returns their subscripts +void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) +{ + ptrdiff_t numel = 0; + int64_t *subscript_data; + int64_t i = 0; + int64_t dim; + int64_t div = 1; +#ifdef TH_REAL_IS_HALF +#define IS_NONZERO(val) ((val.x & 0x7fff) != 0) +#else +#define IS_NONZERO(val) ((val)!=0) +#endif + + /* First Pass to determine size of subscripts */ + TH_TENSOR_APPLY(real, tensor, + if IS_NONZERO(*tensor_data) { + ++numel; + }); +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THLongTensor_resize2d(subscript, numel, tensor->dim()); + + /* Second pass populates subscripts */ + subscript_data = THLongTensor_data(subscript); + TH_TENSOR_APPLY(real, tensor, + if IS_NONZERO(*tensor_data) { + div = 1; + + for (dim = tensor->dim() - 1; dim >= 0; dim--) { + *(subscript_data + dim) = (i/div) % tensor->size[dim]; + div *= tensor->size[dim]; + } + + subscript_data += tensor->dim(); + } + ++i;); +} + +void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) +{ + ptrdiff_t i, numel; + THLongStorage *newSize; + THTensor *tSlice, *sSlice; + int64_t *index_data; + real *tensor_data, *src_data; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(index->_dim() <= 1, 3, "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < src->_dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(src->_dim() > 0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif + + numel = THLongTensor_nElement(index); + + newSize = THLongStorage_newWithSize(src->dim()); + THLongStorage_rawCopy(newSize,src->size); +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THLongStorage_data(newSize)[dim] = numel; + THTensor_(resize)(tensor,newSize,NULL); + THLongStorage_free(newSize); + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + if (dim == 0 && THTensor_(isContiguous)(src) && THTensor_(isContiguous)(tensor)) + { + tensor_data = THTensor_(data)(tensor); + src_data = THTensor_(data)(src); + ptrdiff_t rowsize = src->size[0] == 0 ? 
1: THTensor_(nElement)(src) / src->size[0]; + + // check that the indices are within range + int64_t max = src->size[0] - 1 + TH_INDEX_BASE; + for (i=0; i max) { + THLongTensor_free(index); + THError("index out of range"); + } + } + + if (src->dim() == 1) { + #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; idim() == 1) + { + for (i=0; idim() > 1 ) + { + tSlice = THTensor_(new)(); + sSlice = THTensor_(new)(); + + for (i=0; isize; + int64_t *stride = tensor->stride; + int nDim = tensor->_dim(); + ptrdiff_t dataOffset = 0; + for (int i = nDim - 1; i >= 0; i--) { + dataOffset += (linearIndex % size[i]) * stride[i]; + linearIndex /= size[i]; + } + return dataOffset; +} + +static inline void THTensor_(checkLinearIndex)(int64_t linearIndex, int64_t numel) { + THArgCheck(linearIndex < numel && linearIndex >= -numel, 2, "out of range: %d out of %d", (int)linearIndex, (int)numel); +} + +static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t numel) { + return linearIndex < 0 ? linearIndex + numel : linearIndex; +} + +void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) +{ + THTensor_(resizeNd)(r_, index->dim(), index->size, NULL); + THTensor* dst = THTensor_(newContiguous)(r_); + + index = THLongTensor_newContiguous(index); + int64_t* index_data = THLongTensor_data(index); + ptrdiff_t srcElements = THTensor_(nElement)(src); + real* src_data = THTensor_(data)(src); + real* dst_data = THTensor_(data)(dst); + ptrdiff_t nIndices = THLongTensor_nElement(index); + int isContiguous = THTensor_(isContiguous)(src); + + // Exceptions must not be thrown across OpenMP parallel sections, so we + // record the position of the invalid index and throw the exception after the + // loop. + std::atomic invalidIdxPos(-1); + + ptrdiff_t i; + #pragma omp parallel for if(nIndices > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i = 0; i < nIndices; i++) { + int64_t idx = index_data[i]; + if (idx < srcElements && idx >= -srcElements) { + idx = THTensor_(wrapLinearIndex)(idx, srcElements); + if (isContiguous) { + dst_data[i] = src_data[idx]; + } else { + dst_data[i] = src_data[THTensor_(dataOffset)(src, idx)]; + } + } else { + int64_t tmp = -1; + invalidIdxPos.compare_exchange_strong(tmp, i); + } + } + + if (invalidIdxPos >= 0) { + THTensor_(checkLinearIndex)(index_data[invalidIdxPos], srcElements); + } + + THLongTensor_free(index); + THTensor_(freeCopyTo)(dst, r_); +} + +void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate) +{ + THArgCheck(THLongTensor_nElement(index) == THTensor_(nElement)(src), 3, + "src should have the same number of elements as index"); + + index = THLongTensor_newContiguous(index); + src = THTensor_(newContiguous)(src); + real* data = THTensor_(data)(tensor); + ptrdiff_t numel = THTensor_(nElement)(tensor); + int is_contiguous = THTensor_(isContiguous)(tensor); + + TH_TENSOR_APPLY2(int64_t, index, real, src, + THTensor_(checkLinearIndex)(*index_data, numel); + int64_t linearIndex = THTensor_(wrapLinearIndex)(*index_data, numel); + int64_t dataOffset = is_contiguous ? 
linearIndex : THTensor_(dataOffset)(tensor, linearIndex); + if (accumulate) { + data[dataOffset] += *src_data; + } else { + data[dataOffset] = *src_data; + } + ); + + THTensor_(free)(src); + THLongTensor_free(index); +} + +void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + ptrdiff_t i, numel; + THTensor *tSlice, *sSlice; + int64_t *index_data; + + numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + if (tensor->dim() > 1) + { + tSlice = THTensor_(new)(); + sSlice = THTensor_(new)(); + + for (i=0; i_dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + + index = THLongTensor_newContiguous(index); + index_data = THLongTensor_data(index); + + for (i=0; idim() > 1) + { + tSlice = THTensor_(new)(); + THTensor_(select)(tSlice, tensor,dim,index_data[i] - TH_INDEX_BASE); + THTensor_(fill)(tSlice, val); + THTensor_(free)(tSlice); + } + else + { + THTensor_(set1d)(tensor, index_data[i] - TH_INDEX_BASE, val); + } + } + THLongTensor_free(index); +} + +void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(dim >= 0 && dim < THTensor_(nDimension)(tensor), 3, + "Index dimension is out of bounds"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, + "Input tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= src_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in gather"); + } + *(tensor_data + i*tensor_stride) = src_data[(idx - TH_INDEX_BASE) * src_stride]; + }) +} + +void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + int64_t elems_per_row, i, idx; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#else + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == 
THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#endif + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_SCATTER, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatter"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = *(src_data + i*src_stride); + }) +} + +void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, + TH_TENSOR_DIM_APPLY3_SIZE_SCATTER, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatterAdd"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] += *(src_data + i*src_stride); + }) +} + +void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val) +{ + int64_t elems_per_row, i, idx; + + THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + + elems_per_row = THLongTensor_size(index, dim); + + TH_TENSOR_DIM_APPLY2(real, tensor, int64_t, index, dim, + for (i = 0; i < elems_per_row; ++i) + { + idx = *(index_data + i*index_stride); + if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) + { + THFree(TH_TENSOR_DIM_APPLY_counter); + THError("Invalid index in scatter"); + } + tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = val; + }) +} + +accreal THTensor_(dot)(THTensor *tensor, THTensor *src) +{ + accreal sum = 0; + /* we use a trick here. careful with that. */ + TH_TENSOR_APPLY2(real, tensor, real, src, + int64_t sz = (tensor_size-tensor_i < src_size-src_i ? 
tensor_size-tensor_i : src_size-src_i); + sum += THBlas_(dot)(sz, src_data, src_stride, tensor_data, tensor_stride); + tensor_i += sz; + src_i += sz; + tensor_data += sz*tensor_stride; + src_data += sz*src_stride; + break;); + return sum; +} + + +#undef th_isnan +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#define th_isnan(val) \ +(std::isnan(val)) +#else +#define th_isnan(val) (0) +#endif + +#undef th_isnan_break +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#define th_isnan_break(val) \ +if (std::isnan(val)) break; +#else +#define th_isnan_break(val) +#endif + +real THTensor_(minall)(THTensor *tensor) +{ + real theMin; + real value; + + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + theMin = THTensor_(data)(tensor)[0]; + TH_TENSOR_APPLY(real, tensor, + value = *tensor_data; + /* This is not the same as value= theMin)) + { + theMin = value; + th_isnan_break(value) + }); + return theMin; +} + +real THTensor_(maxall)(THTensor *tensor) +{ + real theMax; + real value; + + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + theMax = THTensor_(data)(tensor)[0]; + TH_TENSOR_APPLY(real, tensor, + value = *tensor_data; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value <= theMax)) + { + theMax = value; + th_isnan_break(value) + }); + return theMax; +} + +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride); + +real THTensor_(medianall)(THTensor *tensor) +{ + THArgCheck(tensor->_dim() > 0, 1, "tensor must have one dimension"); + + real theMedian; + ptrdiff_t numel; + int64_t k; + THTensor *temp_; + real *temp__data; + + numel = THTensor_(nElement)(tensor); + k = (numel-1) >> 1; + + temp_ = THTensor_(newClone)(tensor); + temp__data = THTensor_(data)(temp_); + + THTensor_(quickselectnoidx)(temp__data, k, numel, 1); + + theMedian = temp__data[k]; + + THTensor_(free)(temp_); + + return theMedian; +} + +accreal THTensor_(sumall)(THTensor *tensor) +{ + accreal sum = 0; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, +:sum, sum += *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, sum += *tensor_data;); + } + return sum; +} + +accreal THTensor_(prodall)(THTensor *tensor) +{ + accreal prod = 1; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, *:prod, prod *= *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, prod *= *tensor_data;); + } + return prod; +} + +void THTensor_(add)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(adds)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data + value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, 
r_, real, t, *r__data = *t_data + value;); + } +} + +void THTensor_(sub)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(add)(r_, t, -value); +} + +void THTensor_(add_scaled)(THTensor *r_, THTensor *t, real value, real alpha) +{ + THTensor_(add)(r_, t, value * alpha); +} + +void THTensor_(sub_scaled)(THTensor *r_, THTensor *t, real value, real alpha) +{ + THTensor_(add)(r_, t, -value * alpha); +} + +void THTensor_(mul)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(muls)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data * value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;); + } +} + +void THTensor_(div)(THTensor *r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(divs)(r__data, t_data, value, r__len);); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = *t_data / value;, ORDIN_TH_OMP_OVERHEAD_THRESHOLD) + } +#else + (void)r_Size; + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;); + } +} + +void THTensor_(lshift)(THTensor *r_, THTensor *t, real value) +{ +#if defined(TH_REAL_IS_FLOAT) + return THTensor_(mul)(r_, t, powf(2, value)); +#elif defined(TH_REAL_IS_DOUBLE) + return THTensor_(mul)(r_, t, pow(2, value)); +#elif defined(TH_REAL_IS_HALF) + return THError("lshift is not supported for torch.HalfTensor"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i> value; +#else + rp[i] = ((ureal) tp[i]) >> value; +#endif + } + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (((real) *t_data) >> value);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#else + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (((ureal) *t_data) >> value);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#endif + } +#else + serial_path = 1; +#endif + } + if (serial_path) { +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value);); +#else + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((ureal) *t_data) >> value);); +#endif + } +#endif +} + +void THTensor_(fmod)(THTensor 
*r_, THTensor *t, real value) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int serial_path = 0; + if (r_Contig && tContig) { + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i max_value ? max_value : tp[i]); + } else { +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + if (serial_path) { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);); + } +} + +void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + if(r_ == t) { + THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1); + } else { + TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len);); + } + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;); + } +} + +void THTensor_(csub)(THTensor *r_, THTensor *t, real value, THTensor *src) +{ + THTensor_(cadd)(r_, t, -value, src); +} + +void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cmul)(r__data, t_data, src_data, r__len);); + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;); + } +} + +void THTensor_(pow)(THTensor *r_, THTensor *t, real value) +{ + 
THTensor_(resizeAs)(r_, t); + if(value == 1){ + THTensor_(copy)(r_, t); + } + else if(value == 2){ + THTensor_(cmul)(r_, t, t); + } + else if(value == 3){ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * *t_data * *t_data;); + } +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +#if defined (TH_REAL_IS_FLOAT) +#define TH_MATH_NAME(fn) fn##f +#else +#define TH_MATH_NAME(fn) fn +#endif + else if(value == 0.5){ + THTensor_(sqrt)(r_, t); + } + else if(value == -0.5){ + THTensor_(rsqrt)(r_, t); + } + else if(value == -1){ + THTensor_(cinv)(r_, t); + } + else if(value == -2){ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = TH_MATH_NAME(1.0) / (*t_data * *t_data);); + } + else{ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = TH_MATH_NAME(pow)(*t_data, value);); + } +#undef TH_MATH_NAME +#else + else { + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = THTensor_(powOne)(*t_data, value);); + } +#endif +} + +void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + real *tp = THTensor_(data)(t); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i> sp[i]; +#else + rp[i] = ((ureal) tp[i]) >> sp[i]; +#endif + } + } else { +#if _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { +#if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#elif defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#else + TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, real, r_, real, t, real, src, *r__data = ((ureal)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); +#endif + } +#else + serial_path = 1; +#endif + } + } else { + serial_path = 1; + } + if (serial_path) { +#if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data);); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data);); +#elif defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;); +#else + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((ureal)*t_data) >> *src_data;); +#endif + } +} + +void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src) +{ + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + int serial_path = 0; 
+ if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + real *tp = THTensor_(data)(t); + real *sp = THTensor_(data)(src); + real *rp = THTensor_(data)(r_); + int64_t i; + #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; idim() != 2) || (vec->dim() != 1) ) + THError("matrix and vector expected, got %dD, %dD", + mat->dim(), vec->dim()); + + if( mat->size[1] != vec->size[0] ) { + THDescBuff bm = THTensor_(sizeDesc)(mat); + THDescBuff bv = THTensor_(sizeDesc)(vec); + THError("size mismatch, %s, %s", bm.str, bv.str); + } + + if(t->dim() != 1) + THError("vector expected, got t: %dD", t->dim()); + + if(t->size[0] != mat->size[0]) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bm = THTensor_(sizeDesc)(mat); + THError("size mismatch, t: %s, mat: %s", bt.str, bm.str); + } + + if(r_ != t) + { + THTensor_(resizeAs)(r_, t); + THTensor_(copy)(r_, t); + } + + // n == 1 || lda >= max(1, m) + #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) + + if(mat->stride[0] == 1 && LDA_COND(mat->size[0], mat->size[1], mat->stride[1])) + { + THBlas_(gemv)('n', mat->size[0], mat->size[1], + alpha, THTensor_(data)(mat), mat->stride[1], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + } + else if(mat->stride[1] == 1 && LDA_COND(mat->size[1], mat->size[0], mat->stride[0])) + { + THBlas_(gemv)('t', mat->size[1], mat->size[0], + alpha, THTensor_(data)(mat), mat->stride[0], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + } + else + { + THTensor *cmat = THTensor_(newContiguous)(mat); + + THBlas_(gemv)('t', mat->size[1], mat->size[0], + alpha, THTensor_(data)(cmat), cmat->stride[0], + THTensor_(data)(vec), vec->stride[0], + beta, THTensor_(data)(r_), r_->stride[0]); + + THTensor_(free)(cmat); + } + + #undef LDA_COND +} + +void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) +{ + int64_t N1 = m1->size[0]; + int64_t N2 = m2->size[0]; + int64_t dim; + real *m1_p; + real *m2_p; + real *r_p; + int64_t i; + + THTensor_(resize2d)(r_, N1, N2); + + m1 = THTensor_(newContiguous)(m1); + m2 = THTensor_(newContiguous)(m2); + + THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1); + THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2); + + dim = m1->size[1]; + THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim"); + + m1_p = THTensor_(data)(m1); + m2_p = THTensor_(data)(m2); + r_p = THTensor_(data)(r_); + +#pragma omp parallel for private(i) + for (i=0; idim() != 2) || (m2->dim() != 2)) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); + + if(m1->size[1] != m2->size[0]) { + THDescBuff bm1 = THTensor_(sizeDesc)(m1); + THDescBuff bm2 = THTensor_(sizeDesc)(m2); + THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); + } + + if( t->dim() != 2 ) + THError("matrix expected, got %dD tensor for t", t->dim()); + + if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bm1 = THTensor_(sizeDesc)(m1); + THDescBuff bm2 = THTensor_(sizeDesc)(m2); + THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str); + } + + if(t != r_) + { + THTensor_(resizeAs)(r_, t); + if 
(beta != 0.0) { + THTensor_(copy)(r_, t); + } + } + + // n == 1 || ldc >= max(1, m) + #define LDC_COND(M, N, LDC) ((N) == 1 || (LDC) >= THMax(1, M)) + + /* r_ */ + if(r_->stride[0] == 1 && + LDC_COND(r_->size[0], r_->size[1], r_->stride[1])) + { + transpose_r = 'n'; + r__ = r_; + } + else if(r_->stride[1] == 1 && + LDC_COND(r_->size[1], r_->size[0], r_->stride[0])) + { + THTensor *swap = m2; + m2 = m1; + m1 = swap; + transpose_r = 't'; + r__ = r_; + } + else + { + transpose_r = 'n'; + // make r__ FORTRAN contiguous + THTensor *transp_r_ = THTensor_(newTranspose)(r_, 0, 1); + r__ = THTensor_(newClone)(transp_r_); + THTensor_(free)(transp_r_); + THTensor_(transpose)(r__, NULL, 0, 1); + } + + #undef LDC_COND + + int64_t m = r__->size[(transpose_r == 'n' ? 0 : 1)]; + int64_t n = r__->size[(transpose_r == 'n' ? 1 : 0)]; + int64_t k = m1->size[(transpose_r == 'n' ? 1 : 0)]; + int64_t ldr__ = r__->stride[(transpose_r == 'n' ? 1 : 0)]; + + /* m1 */ + /* Need ldm1_ >= max(1, (transpose_m1 == 'n' ? m : k)) */ + if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m1->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, m)) + { + transpose_m1 = 'n'; + m1_ = m1; + } + else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m1->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, k)) + { + transpose_m1 = 't'; + m1_ = m1; + } + else + { + transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); + m1_ = THTensor_(newContiguous)(m1); + free_m1 = 1; + } + + /* m2 */ + /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ + if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m2->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, k)) + { + transpose_m2 = 'n'; + m2_ = m2; + } + else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m2->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, n)) + { + transpose_m2 = 't'; + m2_ = m2; + } + else + { + transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); + m2_ = THTensor_(newContiguous)(m2); + free_m2 = 1; + } + + int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]); + int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 
0 : 1)]); + +#pragma omp critical(blasgemm) + /* do the operation */ + THBlas_(gemm)(transpose_m1, + transpose_m2, + m, + n, + k, + alpha, + THTensor_(data)(m1_), + ldm1_, + THTensor_(data)(m2_), + ldm2_, + beta, + THTensor_(data)(r__), + ldr__); + + /* free intermediate variables */ + if(free_m1) + THTensor_(free)(m1_); + + if(free_m2) + THTensor_(free)(m2_); + + if(r__ != r_) + THTensor_(freeCopyTo)(r__, r_); +} + +void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) +{ + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) + THError("vector and vector expected, got %dD, %dD tensors", + vec1->dim(), vec2->dim()); + + if(t->dim() != 2) + THError("expected matrix, got %dD tensor for t", t->dim()); + + if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + THDescBuff bt = THTensor_(sizeDesc)(t); + THDescBuff bv1 = THTensor_(sizeDesc)(vec1); + THDescBuff bv2 = THTensor_(sizeDesc)(vec2); + THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str); + } + + if(r_ != t) + { + THTensor_(resizeAs)(r_, t); + THTensor_(copy)(r_, t); + } + + if(beta == 0) { + THTensor_(zero)(r_); + } + else if(beta != 1) + THTensor_(mul)(r_, r_, beta); + + // n == 1 || lda >= max(1, m) + #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) + + if(r_->stride[0] == 1 && LDA_COND(vec1->size[0], vec2->size[0], r_->stride[1])) + { + THBlas_(ger)(vec1->size[0], vec2->size[0], + alpha, THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(r_), r_->stride[1]); + } + else if(r_->stride[1] == 1 && LDA_COND(vec2->size[0], vec1->size[0], r_->stride[0])) + { + THBlas_(ger)(vec2->size[0], vec1->size[0], + alpha, THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(r_), r_->stride[0]); + } + else + { + THTensor *cr = THTensor_(newClone)(r_); + + THBlas_(ger)(vec2->size[0], vec1->size[0], + alpha, THTensor_(data)(vec2), vec2->stride[0], + THTensor_(data)(vec1), vec1->stride[0], + THTensor_(data)(cr), cr->stride[0]); + + THTensor_(freeCopyTo)(cr, r_); + } + + #undef LDA_COND +} + +void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) +{ + int64_t batch; + + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor"); + THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, + "equal number of batches expected, got %d, %d", + THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); + THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, + "wrong matrix size, batch1: %dx%d, batch2: %dx%d", + THTensor_(size)(batch1, 1), THTensor_(size)(batch1,2), + THTensor_(size)(batch2, 1), THTensor_(size)(batch2,2)); + + int64_t dim1 = THTensor_(size)(batch1, 1); + int64_t dim2 = THTensor_(size)(batch2, 2); + THArgCheck(THTensor_(size)(t, 0) == dim1, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 1) == dim2, 1, "output tensor of incorrect size"); + + if (t != result) { + THTensor_(resizeAs)(result, t); + if (beta != 0.0) { + THTensor_(copy)(result, t); + } + } + + THTensor *matrix1 = THTensor_(new)(); + THTensor *matrix2 = THTensor_(new)(); + + for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { + THTensor_(select)(matrix1, batch1, 0, batch); + THTensor_(select)(matrix2, batch2, 0, batch); + + THTensor_(addmm)(result, beta, result, alpha, matrix1, 
matrix2); + beta = 1; // accumulate output once + } + + THTensor_(free)(matrix1); + THTensor_(free)(matrix2); +} + +void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) +{ + int64_t batch; + + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1)); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2)); + THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, + "equal number of batches expected, got %d, %d", + THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); + THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, + "wrong matrix size, batch1: %dx%d, batch2: %dx%d", + THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2), + THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2)); + + int64_t bs = THTensor_(size)(batch1, 0); + int64_t dim1 = THTensor_(size)(batch1, 1); + int64_t dim2 = THTensor_(size)(batch2, 2); + THArgCheck(THTensor_(size)(t, 0) == bs, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 1) == dim1, 1, "output tensor of incorrect size"); + THArgCheck(THTensor_(size)(t, 2) == dim2, 1, "output tensor of incorrect size"); + + if (t != result) { + THTensor_(resizeAs)(result, t); + if (beta != 0.0) { + THTensor_(copy)(result, t); + } + } + + THTensor *matrix1 = THTensor_(new)(); + THTensor *matrix2 = THTensor_(new)(); + THTensor *result_matrix = THTensor_(new)(); + + for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { + THTensor_(select)(matrix1, batch1, 0, batch); + THTensor_(select)(matrix2, batch2, 0, batch); + THTensor_(select)(result_matrix, result, 0, batch); + + THTensor_(addmm)(result_matrix, beta, result_matrix, alpha, matrix1, matrix2); + } + + THTensor_(free)(matrix1); + THTensor_(free)(matrix2); + THTensor_(free)(result_matrix); +} + +ptrdiff_t THTensor_(numel)(THTensor *t) +{ + return THTensor_(nElement)(t); +} + + +// Helper function to be used in a reduction operation. +// Due to resize semantics of outputs, if the specified output tensor r_ has +// same size as the output of the reduction operation, then any noncontiguities +// in r_ should be preserved. +// The reduction operation, however, needs to act on r_ with an extra dimension +// (the reduced dimension), so this function "resizes" r_ and preserves its +// noncontiguities if necessary. 
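+// As a rough illustration (the shapes here are hypothetical, not taken from any
+// particular caller): reducing a 4x3 tensor over dimension 1 with keepdim == 0
+// produces a size-4 result. If the caller passed a 1-D r_ of size 4, possibly
+// noncontiguous, the reduction code below still needs to address r_ with the
+// reduced dimension present, so the helper re-inserts it as a size-1 dimension:
+//
+//   THTensor_(preserveReduceDimSemantics)(r_, /*in_dims=*/2, /*reduce_dimension=*/1, /*keepdim=*/0);
+//   // r_ is now viewed as 4x1; the squeeze1d at the end of each caller drops it again.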
+void THTensor_(preserveReduceDimSemantics)( + THTensor *r_, int in_dims, int reduce_dimension, int keepdim) { + if (r_ && !keepdim && + THTensor_(_nDimension)(r_) == in_dims - 1 && + THTensor_(_nDimension)(r_) != 0) { + THTensor_(unsqueeze1d)(r_, r_, reduce_dimension); + } +} + +void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + real theMax; + real value; + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + theMax = t_data[0]; + theIndex = 0; + + for(i = 0; i < t_size; i++) + { + value = t_data[i*t_stride]; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value <= theMax)) + { + theIndex = i; + theMax = value; + th_isnan_break(value) + } + } + *indices__data = theIndex; + *values__data = theMax;); + } else { + if (THTensor_(_nDimension)(t) > 1) { + THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); + THTensor_(copy)(values_, t0); + THTensor_(free)(t0); + } else { + THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); + } + THLongTensor_zero(indices_); + + if(t->size[dimension] == 1) { + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } + return; + } + + THTensor *tempValues_ = THTensor_(newWithTensor)(values_); + // tempValues_.expand_as(t) + tempValues_->size[dimension] = t->size[dimension]; + tempValues_->stride[dimension] = 0; + + THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); + // tempIndices_.expand_as(t) + tempIndices_->size[dimension] = t->size[dimension]; + tempIndices_->stride[dimension] = 0; + + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, + if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { + *tempValues__data = *t_data; + *tempIndices__data = *tempIndices__dimOffset; + }); + + THTensor_(free)(tempValues_); + THLongTensor_free(tempIndices_); + } + + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + real 
theMax; + real value; + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + theMax = t_data[0]; + theIndex = 0; + + for(i = 0; i < t_size; i++) + { + value = t_data[i*t_stride]; + /* This is not the same as value>theMax in the case of NaNs */ + if(!(value >= theMax)) + { + theIndex = i; + theMax = value; + th_isnan_break(value) + } + } + *indices__data = theIndex; + *values__data = theMax;); + } else { + if (THTensor_(_nDimension)(t) > 1) { + THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); + THTensor_(copy)(values_, t0); + THTensor_(free)(t0); + } else { + THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); + } + THLongTensor_zero(indices_); + + if(t->size[dimension] == 1) { + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } + return; + } + + THTensor *tempValues_ = THTensor_(newWithTensor)(values_); + // tempValues_.expand_as(t) + tempValues_->size[dimension] = t->size[dimension]; + tempValues_->stride[dimension] = 0; + + THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); + // tempIndices_.expand_as(t) + tempIndices_->size[dimension] = t->size[dimension]; + tempIndices_->stride[dimension] = 0; + + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, + if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { + *tempValues__data = *t_data; + *tempIndices__data = *tempIndices__dimOffset; + }); + + THTensor_(free)(tempValues_); + THLongTensor_free(tempIndices_); + } + + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > HYPER_TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 0; + for(j=0; j < t->size[dimension]; ++j) { + *r__data += *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + if (serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal sum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + sum += t_data[i*t_stride]; + *r__data = (real)sum;); + } else { + THTensor_(zero)(r_); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // 
r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;); + THTensor_(free)(temp_); + } + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > HYPER_TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 1; + for(j=0; j < t->size[dimension]; ++j) { + *r__data *= *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + + if(serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal prod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + prod *= t_data[i*t_stride]; + *r__data = (real)prod;); + } else { + THTensor_(fill)(r_, 1); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;); + THTensor_(free)(temp_); + } + } + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, t); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal cumsum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + { + cumsum += t_data[i*t_stride]; + r__data[i*r__stride] = (real)cumsum; + }); +} + +void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, t); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal cumprod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + { + cumprod *= t_data[i*t_stride]; + r__data[i*r__stride] = (real)cumprod; + }); +} + + +void THTensor_(sign)(THTensor *r_, THTensor *t) +{ + THTensor_(resizeAs)(r_, t); + +#if defined (TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(real, r_, real, t, + if (*t_data > 0) *r__data = 1; + else *r__data = 0;); +#else + TH_TENSOR_APPLY2(real, r_, real, t, + if (*t_data > 0) *r__data = 1; + else if (*t_data < 0) 
*r__data = -1; + else *r__data = 0;); +#endif +} + + +accreal THTensor_(trace)(THTensor *t) +{ + real *t_data = THTensor_(data)(t); + accreal sum = 0; + int64_t i = 0; + int64_t t_stride_0, t_stride_1, t_diag_size; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + t_diag_size = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)); + while(i < t_diag_size) + { + sum += t_data[i*(t_stride_0+t_stride_1)]; + i++; + } + + return sum; +} + +void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) +{ + int i; + + if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b)) + THError("inconsistent tensor dimension %dD, %dD", + THTensor_(nDimension)(a), THTensor_(nDimension)(b)); + + for(i = 0; i < THTensor_(nDimension)(a); i++) + { + if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) { + THDescBuff ba = THTensor_(sizeDesc)(a); + THDescBuff bb = THTensor_(sizeDesc)(b); + THError("inconsistent tensor sizes %s, %s", ba.str, bb.str); + } + } + + if(dimension < 0) + { + for(i = 0; i < THTensor_(nDimension)(a); i++) + { + if(THTensor_(size)(a, i) == 3) + { + dimension = i; + break; + } + } + if(dimension < 0) { + THDescBuff ba = THTensor_(sizeDesc)(a); + THError("no dimension of size 3 in a: %s", ba.str); + } + } + + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(a), 3, "dimension %d out of range", + dimension + TH_INDEX_BASE); + THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(r_, a); + + TH_TENSOR_DIM_APPLY3(real, a, real, b, real, r_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + r__data[0*r__stride] = a_data[1*a_stride]*b_data[2*b_stride] - a_data[2*a_stride]*b_data[1*b_stride]; + r__data[1*r__stride] = a_data[2*a_stride]*b_data[0*b_stride] - a_data[0*a_stride]*b_data[2*b_stride]; + r__data[2*r__stride] = a_data[0*a_stride]*b_data[1*b_stride] - a_data[1*a_stride]*b_data[0*b_stride];); +} + +void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY3(real, r, real, t, real, src, + *r_data = *t_data > *src_data ? *t_data : *src_data;); +} + +void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY3(real, r, real, t, real, src, + *r_data = *t_data < *src_data ? *t_data : *src_data;); +} + +void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY2(real, r, real, t, + *r_data = *t_data < value ? value : *t_data;); // this order propagates NaN +} + +void THTensor_(cminValue)(THTensor *r, THTensor *t, real value) { + THTensor_(resizeAs)(r, t); + TH_TENSOR_APPLY2(real, r, real, t, + *r_data = *t_data > value ? 
value : *t_data;); // this order propagates NaN +} + +void THTensor_(zerosLike)(THTensor *r_, THTensor *input) +{ + THTensor_(resizeAs)(r_, input); + THTensor_(zero)(r_); +} + +void THTensor_(onesLike)(THTensor *r_, THTensor *input) +{ + THTensor_(resizeAs)(r_, input); + THTensor_(fill)(r_, 1); +} + +void THTensor_(diag)(THTensor *r_, THTensor *t, int k) +{ +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!t->is_empty()) +#endif + THArgCheck(THTensor_(nDimension)(t) == 1 || THTensor_(nDimension)(t) == 2, 1, "matrix or a vector expected"); + + if(THTensor_(nDimension)(t) == 1) + { + real *t_data = THTensor_(data)(t); + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_size = THTensor_(size)(t, 0); + int64_t sz = t_size + (k >= 0 ? k : -k); + real *r__data; + int64_t r__stride_0; + int64_t r__stride_1; + int64_t i; + + THTensor_(resize2d)(r_, sz, sz); + THTensor_(zero)(r_); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data += (k >= 0 ? k*r__stride_1 : -k*r__stride_0); + + for(i = 0; i < t_size; i++) + r__data[i*(r__stride_0+r__stride_1)] = t_data[i*t_stride_0]; + } + else + { + real *t_data = THTensor_(data)(t); + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_stride_1 = THTensor_(stride)(t, 1); + int64_t sz; + real *r__data; + int64_t r__stride_0; + int64_t i; + + if(k >= 0) + sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k); + else + sz = THMin(THTensor_(size)(t, 0)+k, THTensor_(size)(t, 1)); + THTensor_(resize1d)(r_, sz); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_, 0); + + t_data += (k >= 0 ? k*t_stride_1 : -k*t_stride_0); + for(i = 0; i < sz; i++) + r__data[i*r__stride_0] = t_data[i*(t_stride_0+t_stride_1)]; + } +} + +void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m) +{ + real *r__data; + int64_t i, sz; + + THArgCheck(n > 0, 1, "invalid argument"); + + if(m <= 0) + m = n; + + THTensor_(resize2d)(r_, n, m); + THTensor_(zero)(r_); + + i = 0; + r__data = THTensor_(data)(r_); + sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1)); + for(i = 0; i < sz; i++) + r__data[i*(r_->stride[0]+r_->stride[1])] = 1; +} + + +void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step) +{ + ptrdiff_t size; + real i = 0; + + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + + size = (ptrdiff_t) (((xmax - xmin) / step) + 1); + + if (THTensor_(nElement)(r_) != size) { + THTensor_(resize1d)(r_, size); + } + + TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;); +} + +void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step) { + ptrdiff_t size; + real i = 0; + + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + + size = (ptrdiff_t) ceil((double)(xmax - xmin) / step); + + if (THTensor_(nElement)(r_) != size) { + THTensor_(resize1d)(r_, size); + } + + TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;); +} + +void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n) +{ + real *r__data; + int64_t r__stride_0; + int64_t i; + + THArgCheck(n > 0, 1, "must be strictly positive"); + + THTensor_(resize1d)(r_, n); + r__data = THTensor_(data)(r_); + r__stride_0 = THTensor_(stride)(r_,0); + + for(i = 0; i < n; 
i++) + r__data[i*r__stride_0] = (real)(i); + + for(i = 0; i < n-1; i++) + { + int64_t z = THRandom_random(_generator) % (n-i); + real sav = r__data[i*r__stride_0]; + r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; + r__data[(z+i)*r__stride_0] = sav; + } +} + +/* I cut and pasted (slightly adapted) the quicksort code from + Sedgewick's 1978 "Implementing Quicksort Programs" article + http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf + + It is the state of the art existing implementation. The macros + are here to make as close a match as possible to the pseudocode of + Program 2 p.851 + + Note that other partition schemes exist, and are typically presented + in textbook, but those are less efficient. See e.g. + http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto + + Julien, November 12th 2013 +*/ +#define MAX_LEVELS 300 +#define M_SMALL 10 /* Limit for small subfiles */ + +#define ARR(III) arr[(III)*stride] +#define IDX(III) idx[(III)*stride] + +#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap +#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap + +#define ARR_SWAP(III, JJJ) \ + REAL_SWAP(ARR(III), ARR(JJJ)); + +#define BOTH_SWAP(III, JJJ) \ + REAL_SWAP(ARR(III), ARR(JJJ)); \ + LONG_SWAP(IDX(III), IDX(JJJ)) + +static void THTensor_(quicksortascend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) +{ + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + real rswap, piv; + unsigned char done = 0; + + /* beg[0]=0; end[0]=elements; */ + stack = 0; + L = 0; R = elements-1; + done = elements-1 <= M_SMALL; + + while(!done) { + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; j = R; piv = ARR(L); pid = IDX(L); + + do { + do { i = i+1; } while(ARR(i) < piv); + do { j = j-1; } while(ARR(j) > piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + /* Left subfile is (L, j-1) */ + /* Right subfile is (i, R) */ + sz_left = j-L; + sz_right = R-i+1; + if (sz_left <= M_SMALL && sz_right <= M_SMALL) { + /* both subfiles are small */ + /* if stack empty */ + if (stack == 0) { + done = 1; + } else { + stack--; + L = beg[stack]; + R = end[stack]; + } + } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { + /* exactly one of the subfiles is small */ + /* (L,R) = large subfile */ + if (sz_left > sz_right) { + /* Implicit: L = L; */ + R = j-1; + } else { + L = i; + /* Implicit: R = R; */ + } + } else { + /* none of the subfiles is small */ + /* push large subfile */ + /* (L,R) = small subfile */ + if (sz_left > sz_right) { + beg[stack] = L; + end[stack] = j-1; + stack++; + L = i; + /* Implicit: R = R */ + } else { + beg[stack] = i; + end[stack] = R; + stack++; + /* Implicit: L = L; */ + R = j-1; + } + } + } /* while not done */ + /* Now insertion sort on the concatenation of subfiles */ + for(i=elements-2; i>=0; i--) { + if (ARR(i) > ARR(i+1)) { + piv = ARR(i); + pid = IDX(i); + j = i+1; + do { + ARR(j-1) = ARR(j); + IDX(j-1) = IDX(j); + j = j+1; + } while(j < elements && ARR(j) < piv); + ARR(j-1) = piv; + IDX(j-1) = pid; + } + } +} + +static void THTensor_(quicksortdescend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) +{ + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + real rswap, piv; + unsigned char done = 0; + + /* 
beg[0]=0; end[0]=elements; */ + stack = 0; + L = 0; R = elements-1; + done = elements-1 <= M_SMALL; + + while(!done) { + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; j = R; piv = ARR(L); pid = IDX(L); + + do { + do { i = i+1; } while(ARR(i) > piv); + do { j = j-1; } while(ARR(j) < piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + /* Left subfile is (L, j-1) */ + /* Right subfile is (i, R) */ + sz_left = j-L; + sz_right = R-i+1; + if (sz_left <= M_SMALL && sz_right <= M_SMALL) { + /* both subfiles are small */ + /* if stack empty */ + if (stack == 0) { + done = 1; + } else { + stack--; + L = beg[stack]; + R = end[stack]; + } + } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { + /* exactly one of the subfiles is small */ + /* (L,R) = large subfile */ + if (sz_left > sz_right) { + /* Implicit: L = L; */ + R = j-1; + } else { + L = i; + /* Implicit: R = R; */ + } + } else { + /* none of the subfiles is small */ + /* push large subfile */ + /* (L,R) = small subfile */ + if (sz_left > sz_right) { + beg[stack] = L; + end[stack] = j-1; + stack++; + L = i; + /* Implicit: R = R */ + } else { + beg[stack] = i; + end[stack] = R; + stack++; + /* Implicit: L = L; */ + R = j-1; + } + } + } /* while not done */ + /* Now insertion sort on the concatenation of subfiles */ + for(i=elements-2; i>=0; i--) { + if (ARR(i) < ARR(i+1)) { + piv = ARR(i); + pid = IDX(i); + j = i+1; + do { + ARR(j-1) = ARR(j); + IDX(j-1) = IDX(j); + j = j+1; + } while(j < elements && ARR(j) > piv); + ARR(j-1) = piv; + IDX(j-1) = pid; + } + } +} + +#undef MAX_LEVELS +#undef M_SMALL + +void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(resizeAs)(rt_, t); + THTensor_(copy)(rt_, t); + + { + THLongStorage *size = THTensor_(newSizeOf)(t); + THLongTensor_resize(ri_, size, NULL); + THLongStorage_free(size); + } + + if(descendingOrder) + { + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; + for(i = 0; i < ri__size; i++) + ri__data[i*ri__stride] = i; + THTensor_(quicksortdescend)(rt__data, ri__data, rt__size, rt__stride);) + } + else + { + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; + for(i = 0; i < ri__size; i++) + ri__data[i*ri__stride] = i; + THTensor_(quicksortascend)(rt__data, ri__data, rt__size, rt__stride);) + } +} + +/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's +public domain implementation at http://ndevilla.free.fr/median/median/ +Adapted similarly to the above Quicksort algorithm. +This version does not produce indices along with values. 
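+
+As a rough sketch of the contract the callers rely on (the sample values below are
+hypothetical): after partitioning around a pivot, only the side containing position
+k is kept, so the expected cost is O(n) rather than O(n log n), and on return
+arr[k] holds the k-th smallest element (0-based) of the slice. For example, with
+
+  real data[5] = {3, 1, 4, 1, 5};
+  THTensor_(quickselectnoidx)(data, 2, 5, 1);
+
+data[2] afterwards holds 3, the third-smallest value; medianall() picks exactly
+that slot after copying the tensor into a contiguous clone, and kthvalue()/topk()
+use the indexed variant below in the same way.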
*/ +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride) +{ + int64_t P, L, R, i, j; + real rswap, piv; + L = 0; + R = elements-1; + + do { + if (R <= L) /* One element only */ + return; + + if (R == L+1) { /* Two elements only */ + if (ARR(L) > ARR(R)) { + ARR_SWAP(L, R); + } + return; + } + + /* Use median of three for pivot choice */ + P=(L+R)>>1; + ARR_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { ARR_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { ARR_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { ARR_SWAP(L+1, L); } + + i = L+1; + j = R; + piv = ARR(L); + do { + do i++; while(ARR(i) < piv); + do j--; while(ARR(j) > piv); + if (j < i) + break; + ARR_SWAP(i, j); + } while(1); + ARR_SWAP(L, j); + + /* Re-set active partition */ + if (j <= k) L=i; + if (j >= k) R=j-1; + } while(1); +} + +/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's +public domain implementation at http://ndevilla.free.fr/median/median/ +Adapted similarly to the above Quicksort algorithm. */ +static void THTensor_(quickselect)(real *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride) +{ + int64_t P, L, R, i, j, swap; + real rswap, piv; + L = 0; + R = elements-1; + + do { + if (R <= L) /* One element only */ + return; + + if (R == L+1) { /* Two elements only */ + if (ARR(L) > ARR(R)) { + BOTH_SWAP(L, R); + } + return; + } + + /* Use median of three for pivot choice */ + P=(L+R)>>1; + BOTH_SWAP(P, L+1); + if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } + if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } + if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } + + i = L+1; + j = R; + piv = ARR(L); + do { + do i++; while(ARR(i) < piv); + do j--; while(ARR(j) > piv); + if (j < i) + break; + BOTH_SWAP(i, j); + } while(1); + BOTH_SWAP(L, j); + + /* Re-set active partition */ + if (j <= k) L=i; + if (j >= k) R=j-1; + } while(1); +} + +#undef ARR +#undef IDX +#undef LONG_SWAP +#undef REAL_SWAP +#undef BOTH_SWAP + +void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + THTensor *temp_; + THLongTensor *tempi_; + real *temp__data; + int64_t *tempi__data; + int64_t t_size_dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + t_size_dim = THTensor_(size)(t, dimension); + + temp_ = THTensor_(new)(); + THTensor_(resize1d)(temp_, t_size_dim); + temp__data = THTensor_(data)(temp_); + + tempi_ = THLongTensor_new(); + THLongTensor_resize1d(tempi_, t_size_dim); + tempi__data = THLongTensor_data(tempi_); + + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + real mode = 0; + int64_t modei = 0; + int64_t temp_freq = 0; + int64_t max_freq = 0; + for(i = 0; i < t_size_dim; i++) + temp__data[i] = t_data[i*t_stride]; + for(i = 0; i < t_size_dim; i++) + tempi__data[i] = i; + THTensor_(quicksortascend)(temp__data, tempi__data, t_size_dim, 1); + + for(i = 0; i < t_size_dim; i++) + { + temp_freq++; + if ((i == t_size_dim - 1) || (temp__data[i] != temp__data[i+1])) + { + if (temp_freq > max_freq) + { + mode = 
temp__data[i]; + modei = tempi__data[i]; + max_freq = temp_freq; + } + temp_freq = 0; + } + } + *values__data = mode; + *indices__data = modei;); + + THTensor_(free)(temp_); + THLongTensor_free(tempi_); + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim) +{ + THLongStorage *dim; + THTensor *temp_; + THLongTensor *tempi_; + real *temp__data; + int64_t *tempi__data; + int64_t t_size_dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); + + int in_dims = THTensor_(_nDimension)(t); + THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); + THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(values_, dim, NULL); + THLongTensor_resize(indices_, dim, NULL); + THLongStorage_free(dim); + + t_size_dim = THTensor_(size)(t, dimension); + + temp_ = THTensor_(new)(); + THTensor_(resize1d)(temp_, t_size_dim); + temp__data = THTensor_(data)(temp_); + + tempi_ = THLongTensor_new(); + THLongTensor_resize1d(tempi_, t_size_dim); + tempi__data = THLongTensor_data(tempi_); + + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < t_size_dim; i++) + temp__data[i] = t_data[i*t_stride]; + for(i = 0; i < t_size_dim; i++) + tempi__data[i] = i; + THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1); + *values__data = temp__data[k-1]; + *indices__data = tempi__data[k-1];); + + THTensor_(free)(temp_); + THLongTensor_free(tempi_); + if (!keepdim) { + THTensor_(squeeze1d)(values_, values_, dimension); + THLongTensor_squeeze1d(indices_, indices_, dimension); + } +} + +void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) +{ + int64_t t_size_dim, k; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); + + t_size_dim = THTensor_(size)(t, dimension); + k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */ + + THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim); +} + +void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) +{ +#ifndef USE_TH_SIZE_ZERO_DIM + int numDims = THTensor_(_nDimension)(t); +#else + int numDims = THTensor_(nDimension)(t); +#endif + THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); + + int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else + THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif + + THTensor *tmpResults = THTensor_(new)(); + THTensor_(resize1d)(tmpResults, sliceSize); + real *tmp__data = THTensor_(data)(tmpResults); + + THLongTensor *tmpIndices = THLongTensor_new(); + THLongTensor_resize1d(tmpIndices, sliceSize); + int64_t *tmpi__data = THLongTensor_data(tmpIndices); + + THLongStorage *topKSize = THTensor_(newSizeOf)(t); + THLongStorage_set(topKSize, dim, k); + THTensor_(resize)(rt_, topKSize, NULL); + THLongTensor_resize(ri_, topKSize, NULL); + THLongStorage_free(topKSize); + + if 
(dir) { + /* k largest elements, descending order (optional: see sorted) */ + int64_t K = sliceSize - k; + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < sliceSize; i++) + { + tmp__data[i] = t_data[i*t_stride]; + tmpi__data[i] = i; + } + if (K > 0) + THTensor_(quickselect)(tmp__data, tmpi__data, K - 1, sliceSize, 1); + if (sorted) + THTensor_(quicksortdescend)(tmp__data + K, tmpi__data + K, k, 1); + for(i = 0; i < k; i++) + { + rt__data[i*rt__stride] = tmp__data[i + K]; + ri__data[i*ri__stride] = tmpi__data[i + K]; + }) + } + else { + /* k smallest elements, ascending order (optional: see sorted) */ + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, + int64_t i; + for(i = 0; i < sliceSize; i++) + { + tmp__data[i] = t_data[i*t_stride]; + tmpi__data[i] = i; + } + THTensor_(quickselect)(tmp__data, tmpi__data, k - 1, sliceSize, 1); + if (sorted) + THTensor_(quicksortascend)(tmp__data, tmpi__data, k - 1, 1); + for(i = 0; i < k; i++) + { + rt__data[i*rt__stride] = tmp__data[i]; + ri__data[i*ri__stride] = tmpi__data[i]; + }) + } + + THTensor_(free)(tmpResults); + THLongTensor_free(tmpIndices); +} + +void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k) +{ + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + int64_t r__stride_0, r__stride_1; + real *t_data, *r__data; + int64_t r, c; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + THTensor_(resizeAs)(r_, t); + + t_size_0 = THTensor_(size)(t, 0); + t_size_1 = THTensor_(size)(t, 1); + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data = THTensor_(data)(r_); + t_data = THTensor_(data)(t); + + for(r = 0; r < t_size_0; r++) + { + int64_t sz = THMin(r+k+1, t_size_1); + for(c = THMax(0, r+k+1); c < t_size_1; c++) + r__data[r*r__stride_0+c*r__stride_1] = 0; + for(c = 0; c < sz; c++) + r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; + } +} + +void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k) +{ + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + int64_t r__stride_0, r__stride_1; + real *t_data, *r__data; + int64_t r, c; + + THArgCheck(THTensor_(_nDimension)(t) == 2, 1, "expected a matrix"); + + THTensor_(resizeAs)(r_, t); + + t_size_0 = THTensor_(size)(t, 0); + t_size_1 = THTensor_(size)(t, 1); + t_stride_0 = THTensor_(stride)(t, 0); + t_stride_1 = THTensor_(stride)(t, 1); + r__stride_0 = THTensor_(stride)(r_, 0); + r__stride_1 = THTensor_(stride)(r_, 1); + r__data = THTensor_(data)(r_); + t_data = THTensor_(data)(t); + + for(r = 0; r < t_size_0; r++) + { + int64_t sz = THMin(r+k, t_size_1); + for(c = THMax(0, r+k); c < t_size_1; c++) + r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; + for(c = 0; c < sz; c++) + r__data[r*r__stride_0+c*r__stride_1] = 0; + } +} + +void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension) +{ + THTensor* inputs[2]; + inputs[0] = ta; + inputs[1] = tb; + THTensor_(catArray)(r_, inputs, 2, dimension); +} + +void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension); +inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, int dimension) +{ + int first_dims = first->dim(); + int second_dims = second->dim(); + THArgCheck(first_dims == second_dims, 0, + "Tensors must have same number of 
dimensions: got %d and %d", + first_dims, second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = first->size[dim]; + int64_t second_dim_size = second->size[dim]; + THArgCheck(first_dim_size == second_dim_size, 0, + "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", + dimension, (long long)first_dim_size, (long long)second_dim_size, dim); + } +} + +void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension) +{ + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). + // FIXME: warn if this is the case + bool allSkipped= true; + int64_t nDims = 0; + THTensor *notSkippedTensor; // non-owning reference + auto should_skip = [](THTensor *t) { return t->is_empty() && t->dim() == 1; }; + for (int i = 0; i < numInputs; i++) { + if (should_skip(inputs[i])) { + continue; + } + // We've found a non-empty tensor + allSkipped = false; + notSkippedTensor = inputs[i]; + nDims = notSkippedTensor->dim(); + break; + } + if (allSkipped) { + return; + } + + // Compute cat_dimension based on the non-empty tensor + THArgCheck(dimension < nDims, 4, "invalid dimension %d", dimension); + THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + for (int i = 0; i < numInputs; i++) { + THTensor *tensor = inputs[i]; + if (should_skip(tensor)) { + continue; + } + THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension); + cat_dim_size += tensor->size[dimension]; + } + + // Compute the size of the result + THLongStorage *size = THLongStorage_newWithSize(nDims); + for (int dim = 0; dim < nDims; dim++) { + int64_t result_dim_size = notSkippedTensor->size[dim]; + if (dim == dimension) { + result_dim_size = cat_dim_size; + } + THLongStorage_data(size)[dim] = result_dim_size; + } + THTensor_(resize)(result, size, NULL); + + // Check contiguity of all inputs and result + bool allContiguous = true; + for (int i = 0; i < numInputs; i++) { + if(!should_skip(inputs[i])) { + allContiguous = allContiguous && THTensor_(isContiguous)(inputs[i]); + } + } + allContiguous = allContiguous && THTensor_(isContiguous)(result); + + // First path is for contiguous inputs along dim 0 + // Second path for non-contiguous + int64_t offset; + if (dimension == 0 && allContiguous) { + real* result_data = THStorage_(data)(result->storage) + result->storageOffset; + offset = 0; + for (int j = 0; j < numInputs; j++) { + if (!should_skip(inputs[j])) { + THTensor* input0 = inputs[j]; + real* input0_data = THStorage_(data)(input0->storage) + input0->storageOffset; + int64_t input0_size = THTensor_(nElement)(input0); + // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this. 
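+        // To make the offset arithmetic concrete (sizes are hypothetical): for a
+        // dim-0 cat of contiguous 2x3 and 4x3 inputs, the 6x3 result's storage is
+        // just the 6 elements of the first input followed by the 12 elements of
+        // the second, so one memcpy per input at the running offset is enough;
+        // the non-contiguous / dim != 0 case below falls back to narrow + copy.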
+ if (input0_size != 0) { + memcpy(result_data + offset, input0_data, input0_size*sizeof(real)); + } + offset += input0_size; + } + } + } else { + offset = 0; + for (int j = 0; j < numInputs; j++) { + if (!should_skip(inputs[j])) { + int64_t dimSize = inputs[j]->size[dimension]; + THTensor *nt = THTensor_(newWithTensor)(result); + THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); + THTensor_(copy)(nt, inputs[j]); + THTensor_(free)(nt); + offset += dimSize; + } + } + } + THLongStorage_free(size); +} + +int THTensor_(equal)(THTensor *ta, THTensor* tb) +{ + int equal = 1; + if(!THTensor_(isSameSizeAs)(ta, tb)) + return 0; + + if (THTensor_(isContiguous)(ta) && THTensor_(isContiguous)(tb)) { + real *tap = THTensor_(data)(ta); + real *tbp = THTensor_(data)(tb); + ptrdiff_t sz = THTensor_(nElement)(ta); + ptrdiff_t i; + for (i=0; idim(), t->size, NULL); \ + TH_TENSOR_APPLY2(unsigned char, r_, real, t, \ + *r__data = (*t_data OP value) ? 1 : 0;); \ + } \ + void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \ + { \ + THTensor_(resizeNd)(r_, t->dim(), t->size, NULL); \ + TH_TENSOR_APPLY2(real, r_, real, t, \ + *r__data = (*t_data OP value) ? 1 : 0;); \ + } \ + void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \ + { \ + THByteTensor_resizeNd(r_, ta->dim(), ta->size, NULL); \ + TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \ + *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ + } \ + void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \ + { \ + THTensor_(resizeNd)(r_, ta->dim(), ta->size, NULL); \ + TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \ + *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ + } \ + + +TENSOR_IMPLEMENT_LOGICAL(lt,<) +TENSOR_IMPLEMENT_LOGICAL(gt,>) +TENSOR_IMPLEMENT_LOGICAL(le,<=) +TENSOR_IMPLEMENT_LOGICAL(ge,>=) +TENSOR_IMPLEMENT_LOGICAL(eq,==) +TENSOR_IMPLEMENT_LOGICAL(ne,!=) + + +#ifdef _OPENMP + +#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + ptrdiff_t r_Size = THTensor_(nElement)(r_); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + int inOMP = omp_in_parallel(); \ + if( !inOMP ){ \ + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \ + } else { \ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = CFUNC(*t_data);); \ + } \ + } + +#define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \ + LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + ptrdiff_t r_Size = THTensor_(nElement)(r_); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + if (r_Contig && tContig) { \ + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(NAME)(r__data, t_data, r__len);); \ + } else { \ + int inOMP = omp_in_parallel(); \ + if( !inOMP ){ \ + TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, real, r_, real, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \ + } \ + else { \ + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = CFUNC(*t_data);); \ + } \ + } \ + } + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \ + LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD) + +#else + +#define 
LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \ + } \ + +#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \ + LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \ + void THTensor_(NAME)(THTensor *r_, THTensor *t) \ + { \ + THTensor_(resizeAs)(r_, t); \ + int r_Contig = THTensor_(isContiguous)(r_); \ + int tContig = THTensor_(isContiguous)(t); \ + if (r_Contig && tContig) { \ + TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(NAME)(r__data, t_data, r__len);); \ + } else { \ + TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \ + } \ + } \ + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \ + LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) + +#endif + +#define EXPAND(...) __VA_ARGS__ + +#define GET_4TH_ARG(ARG0, ARG1, ARG2, ARG3, ...) ARG3 + +#define LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(...) \ + EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, )) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(...) \ + EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS, LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS, )) + +#define LAB_IMPLEMENT_BASIC_FUNCTION(...) EXPAND(LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__)) + +#define LAB_IMPLEMENT_VECTORIZED_FUNCTION(...) EXPAND(LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__)) + +/* + * LAB_IMPLEMENT_BASIC_FUNCTION is a macro with optional parameters, you can use it flexibly. + * The macro will discard the invalid openmp threshold if openmp is unavailable. The macro will give a default threshold even if you forget to pass one. + * In other word, + * (A), If openmp is UNavailable, the two usage below is both right. + * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) // discard the invalid openmp threshold + * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) + * (B), If openmp is available, the two usage below is also both right. + * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) + * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) // pass the default openmp threshold + * So do LAB_IMPLEMENT_VECTORIZED_FUNCTION. 
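+ *
+ * Roughly, the dispatch works like this: LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE appends the
+ * _3_ARGS and _2_ARGS variants to the argument list, and GET_4TH_ARG selects whichever
+ * variant lands in the fourth slot. Illustrative expansion only:
+ *   LAB_IMPLEMENT_BASIC_FUNCTION(log, TH_MATH_NAME(log))                                  -> _2_ARGS variant
+ *   LAB_IMPLEMENT_BASIC_FUNCTION(exp, TH_MATH_NAME(exp), HYPER_TH_OMP_OVERHEAD_THRESHOLD) -> _3_ARGS variant
+ * Either way the result is a definition of the form void THTensor_(NAME)(THTensor *r_, THTensor *t).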
+*/ + +LAB_IMPLEMENT_BASIC_FUNCTION(neg,-) + +#if defined(TH_REAL_IS_LONG) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs) +#endif /* int64_t only part */ + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs) +#endif /* int only part */ + +#if defined(TH_REAL_IS_BYTE) + +int THTensor_(logicalAndAll)(THTensor *tensor) +{ + real prod = 1; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, &&:prod, prod = prod && *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, prod = prod && *tensor_data;); + } + return prod; +} + +int THTensor_(logicalAnyAll)(THTensor *tensor) +{ + real sum = 0; + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if(inOMP) { + serial_path = 1; + } else { + TH_TENSOR_APPLY_REDUCTION_OMP(real, tensor, ||:sum, sum = sum || *tensor_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#else + serial_path = 1; +#endif + if (serial_path) { + TH_TENSOR_APPLY(real, tensor, sum = sum || *tensor_data;); + } + return (bool)sum; +} + +void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 1; + for(j=0; j < t->size[dimension]; ++j) { + *r__data = *r__data && *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + + if(serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal prod = 1; + int64_t i; + for(i = 0; i < t_size; i++) + prod = prod && t_data[i*t_stride]; + *r__data = (real)prod;); + } else { + THTensor_(fill)(r_, 1); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data && *t_data;); + THTensor_(free)(temp_); + } + } + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + 
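+  // Reduce with logical OR along `dimension`: each output element is the OR of the
+  // corresponding input slice (the counterpart of logicalAnd above, which reduces with &&).
+  // The OpenMP path below walks the contiguous output linearly and reconstructs the
+  // matching base offset into t from the output strides.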
THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + int serial_path = 0; +#ifdef _OPENMP + int inOMP = omp_in_parallel(); + if (inOMP) { + serial_path = 1; + } else { + int r_Contig = THTensor_(isContiguous)(r_); + real *tp = THTensor_(data)(t); + real *rp = THTensor_(data)(r_); + if(r_Contig && (tp != rp)){ + ptrdiff_t iter = 0; + ptrdiff_t r_Size = THTensor_(nElement)(r_); + int r_Dim = r_->_dim(); + #pragma omp parallel for if ( r_Size > TH_OMP_OVERHEAD_THRESHOLD) + for (iter = 0; iter < r_Size; iter++) { + int j; + int64_t quot; + int64_t rem = iter; + ptrdiff_t tBasicIndex = 0; + + for(j = 0; j < r_Dim; ++j) { + if(j != dimension){ + quot = rem/r_->stride[j]; + rem = rem%r_->stride[j]; + tBasicIndex += quot*t->stride[j]; + } + } + real *t_data = tp+tBasicIndex; + real *r__data = rp+iter; + *r__data = 0; + for(j=0; j < t->size[dimension]; ++j) { + *r__data = *r__data || *(t_data + j*t->stride[dimension]); + } + } + } else { + serial_path = 1; + } + } +#else + serial_path = 1; +#endif + if (serial_path) { + // two implementations optimized for data locality + if (t->stride[dimension] == 1) { + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + accreal sum = 0; + int64_t i; + for(i = 0; i < t_size; i++) + sum = sum || t_data[i*t_stride]; + *r__data = (real)sum;); + } else { + THTensor_(zero)(r_); + THTensor *temp_ = THTensor_(newWithTensor)(r_); + // r_.expand_as(t) + temp_->size[dimension] = t->size[dimension]; + temp_->stride[dimension] = 0; + + TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data || *t_data;); + THTensor_(free)(temp_); + } + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +#endif /* Byte only part */ + +/* floating point only now */ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +#if defined (TH_REAL_IS_FLOAT) +#define TH_MATH_NAME(fn) fn##f +#else +#define TH_MATH_NAME(fn) fn +#endif + +LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log)) +LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(digamma,TH_MATH_NAME(TH_digamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(trigamma,TH_MATH_NAME(TH_trigamma)) +LAB_IMPLEMENT_BASIC_FUNCTION(log10,TH_MATH_NAME(log10)) +LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p)) +LAB_IMPLEMENT_BASIC_FUNCTION(log2,TH_MATH_NAME(log2)) +LAB_IMPLEMENT_BASIC_FUNCTION(erf,TH_MATH_NAME(erf)) +LAB_IMPLEMENT_BASIC_FUNCTION(erfc,TH_MATH_NAME(erfc)) +LAB_IMPLEMENT_BASIC_FUNCTION(erfinv,TH_erfinv) +LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil)) +LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor)) +LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round)) +LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs)) +LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc)) +LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac)) +LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / ) + +LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(expm1,TH_MATH_NAME(expm1),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) 
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD) +LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD) + +LAB_IMPLEMENT_VECTORIZED_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid),HYPER_TH_OMP_OVERHEAD_THRESHOLD) + +void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty) +{ + THTensor_(resizeAs)(r_, tx); + TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data);); +} + +void THTensor_(polygamma)(THTensor *r_, int64_t n, THTensor *t) { + switch (n) { + case 0: THTensor_(digamma)(r_, t); return; + case 1: THTensor_(trigamma)(r_, t); return; + default: THError("polygamma(n,x) is not implemented for n>=2"); + } +} + +void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight) +{ + THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match"); + THTensor_(resizeAs)(r_, a); + TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight);); +} + +void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim) +{ + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(sum)(r_, t, dimension, keepdim); + THTensor_(div)(r_, r_, t->size[dimension]); +} + +void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + // Uses Welford's algorithm for numeric stability + accreal mean = 0; + accreal M2 = 0; + + int64_t i; + for (i = 0; i < t_size; i++) + { + real z = t_data[i*t_stride]; + real delta = z - mean; + mean += delta / (i + 1); + real delta2 = z - mean; + M2 += delta * delta2; + } + + if (biased && t_size >= 2) + { + *r__data = TH_MATH_NAME(sqrt)(M2 / t_size); + } else if (!biased && t_size >= 2) { + *r__data = TH_MATH_NAME(sqrt)(M2 / (t_size - 1)); + } else if (biased && t_size == 1) { + *r__data = 0; + } else { + *r__data = NAN; + }); + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, + // Uses 
Welford's algorithm for numeric stability + accreal mean = 0; + accreal M2 = 0; + + int64_t i; + for (i = 0; i < t_size; i++) + { + real z = t_data[i*t_stride]; + real delta = z - mean; + mean += delta / (i + 1); + real delta2 = z - mean; + M2 += delta * delta2; + } + + if (biased && t_size >= 2) + { + *r__data = M2 / t_size; + } else if (!biased && t_size >= 2) { + *r__data = M2 / (t_size - 1); + } else if (biased && t_size == 1) { + *r__data = 0; + } else { + *r__data = NAN; + }); + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } +} + +void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim) +{ + THLongStorage *dim; + + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + THTensor_(preserveReduceDimSemantics)(r_, THTensor_(_nDimension)(t), dimension, keepdim); + dim = THTensor_(newSizeOf)(t); + THLongStorage_set(dim, dimension, 1); + THTensor_(resize)(r_, dim, NULL); + THLongStorage_free(dim); + + #define DIM_REDUCE(reduce, transform) \ + TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, \ + accreal sum = 0; \ + int64_t i; \ + for(i = 0; i < t_size; i++) { \ + (reduce); \ + } \ + (transform);) \ + + if(value == 0) { + DIM_REDUCE(sum += t_data[i*t_stride] != 0.0, + *r__data = sum); + } else if (value == 1) { + DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride]), + *r__data = sum); + } else if (value == 2) { + DIM_REDUCE(sum += t_data[i*t_stride] * t_data[i*t_stride], + *r__data = TH_MATH_NAME(sqrt)(sum)); + } else if (value == 3) { + DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride] * t_data[i*t_stride] * t_data[i*t_stride]), + *r__data = TH_MATH_NAME(pow)(sum, 1.0/3)); + } else if (value == INFINITY) { + DIM_REDUCE(sum = THMax(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])), + *r__data = sum); + } else { + DIM_REDUCE(sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(t_data[i*t_stride]), value), + *r__data = TH_MATH_NAME(pow)(sum, 1.0/value)); + } + + if (!keepdim) { + THTensor_(squeeze1d)(r_, r_, dimension); + } + #undef DIM_REDUCE +} + +accreal THTensor_(normall)(THTensor *tensor, real value) +{ + accreal sum = 0; + if(value == 0) { + TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;); + return sum; + } else if(value == 1) { + TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data);); + return sum; + } else if(value == 2) { + TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;); + return sqrt(sum); + } else if(value == 3) { + TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += std::abs(z*z*z);); + return TH_MATH_NAME(pow)(sum, 1.0/3); + } else if(value == INFINITY) { + TH_TENSOR_APPLY(real, tensor, sum = THMax(sum, TH_MATH_NAME(fabs)(*tensor_data));); + return sum; + } else { + TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value);); + return TH_MATH_NAME(pow)(sum, 1.0/value); + } +} + +void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm) +{ + int i; + THTensor *rowR, *rowS; + + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d", + dimension + TH_INDEX_BASE); + THArgCheck(value > 0, 2, "non-positive-norm not supported"); + THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", + THTensor_(nDimension)(src)); + + rowR = THTensor_(new)(); + rowS = THTensor_(new)(); + + THTensor_(resizeAs)(res, src); + + for (i=0; isize[dimension]; i++) + { + real norm = 
0; + real new_norm; + + THTensor_(select)(rowS, src, dimension, i); + THTensor_(select)(rowR, res, dimension, i); + if (value == 1) { + TH_TENSOR_APPLY(real, rowS, norm += fabs(*rowS_data);); + } else if (value == 2) { + TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;); + } else if (value == INFINITY) { + TH_TENSOR_APPLY(real, rowS, norm = THMax(norm, TH_MATH_NAME(fabs)(*rowS_data));); + } else { + TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value);); + } + + if (value != INFINITY) { + norm = pow(norm, 1/value); + } + + if (norm > maxnorm) + { + new_norm = maxnorm / (norm + 1e-7); + + TH_TENSOR_APPLY2( + real, rowR, real, rowS, + *rowR_data = (*rowS_data) * new_norm; + ) + } + else + THTensor_(copy)(rowR, rowS); + } + + THTensor_(free)(rowR); + THTensor_(free)(rowS); +} + +accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value) +{ + real sum = 0; + TH_TENSOR_APPLY2(real, tensor, real, src, + sum += TH_MATH_NAME(pow)( + TH_MATH_NAME(fabs)(*tensor_data - *src_data), value);); + return TH_MATH_NAME(pow)(sum, 1.0/value); +} + +accreal THTensor_(meanall)(THTensor *tensor) +{ + return THTensor_(sumall)(tensor)/THTensor_(nElement)(tensor); +} + +accreal THTensor_(varall)(THTensor *tensor, int biased) +{ + accreal mean = THTensor_(meanall)(tensor); + accreal sum = 0; + TH_TENSOR_APPLY(real, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean);); + sum /= std::max(0, THTensor_(nElement)(tensor) - (biased ? 0 : 1)); + return sum; +} + +accreal THTensor_(stdall)(THTensor *tensor, int biased) +{ + return sqrt(THTensor_(varall)(tensor, biased)); +} + +void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n) +{ + real i = 0; + + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + + if (THTensor_(nElement)(r_) != n) { + THTensor_(resize1d)(r_, n); + } + + if (n == 0) { + } else if (n == 1) { + THTensor_(set1d)(r_, 0, a); + } else { + TH_TENSOR_APPLY(real, r_, + *r__data = a + (b-a)/((real)(n-1))*i; + i++; + ); + } +} + +void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n) +{ + real i = 0; + + // NumPy allows you to pass different points even if n <= 1 -- should we? 
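+  // Illustrative example: logspace(r_, 0, 2, 3) fills r_ with {1, 10, 100}, i.e. n points
+  // whose exponents are evenly spaced between a and b.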
+ THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + + if (THTensor_(nElement)(r_) != n) { + THTensor_(resize1d)(r_, n); + } + + if (n == 0) { + } else if (n == 1) { + THTensor_(set1d)(r_, 0, TH_MATH_NAME(pow)(10.0, a)); + } else { + TH_TENSOR_APPLY(real, r_, + *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1))); + i++; + ); + } +} + +void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) +{ + real minval; + real maxval; + real *h_data; + + THTensor_(resize1d)(hist, nbins); + THTensor_(zero)(hist); + minval = minvalue; + maxval = maxvalue; + if (minval == maxval) + { + minval = THTensor_(minall)(tensor); + maxval = THTensor_(maxall)(tensor); + } + if (minval == maxval) + { + minval = minval - 1; + maxval = maxval + 1; + } + + h_data = THTensor_(data)(hist); + + TH_TENSOR_APPLY(real, tensor, + if (*tensor_data >= minval && *tensor_data <= maxval) { + const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins); + h_data[THMin(bin, nbins-1)] += 1; + } + ); +} + +void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) +{ + THArgCheck(THTensor_(_nDimension)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(_nDimension)(tensor)); + + int dimension = 1; + THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(tensor), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + real minval; + real maxval; + + THTensor_(resize2d)(hist, tensor->size[0], nbins); + THTensor_(zero)(hist); + + minval = minvalue; + maxval = maxvalue; + if (minval == maxval) + { + minval = THTensor_(minall)(tensor); + maxval = THTensor_(maxall)(tensor); + } + if (minval == maxval) + { + minval = minval - 1; + maxval = maxval + 1; + } + + TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, int64_t i; + for(i = 0; i < tensor_size; i++) + { + if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) { + const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins); + hist_data[THMin(bin, nbins-1)] += 1; + } + } + ); +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes x is close to zero and uses a Taylor expansion. +static inline real THTensor_(beta_grad_alpha_small)(real x, real alpha, real beta) { + const real factor = TH_MATH_NAME(TH_digamma)(alpha) - TH_MATH_NAME(TH_digamma)(alpha + beta) - TH_MATH_NAME(log)(x); + real numer = 1; + real series = numer / alpha * (factor + 1 / alpha); + for (int i = 1; i <= 10; ++i) { + numer *= (i - beta) * x / i; + const real denom = alpha + i; + series += numer / denom * (factor + 1 / denom); + } + const real result = x * TH_MATH_NAME(pow)(1 - x, -beta) * series; + return th_isnan(result) ? 0.0 : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta. +// Assumes x is close to zero and uses a Taylor expansion. +static inline real THTensor_(beta_grad_beta_small)(real x, real alpha, real beta) { + const real factor = TH_MATH_NAME(TH_digamma)(alpha+beta) - TH_MATH_NAME(TH_digamma)(beta); + real numer = 1; + real betas = 1; + real dbetas = 0; + real series = factor / alpha; + for (int i = 1; i <= 8; ++i) { + numer *= -x / i; + dbetas = dbetas * (beta - i) + betas; + betas = betas * (beta - i); + series += numer / (alpha + i) * (dbetas + factor * betas); + } + const real result = -TH_MATH_NAME(pow)(1 - x, 1 - beta) * series; + return th_isnan(result) ? 
0.0 : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes alpha and beta are both large and uses a Rice saddle point expansion. +// To ensure numerical stability, this computation is performed at higher precision. +static inline real THTensor_(beta_grad_alpha_mid)(double x, double alpha, double beta) { + const double total = alpha + beta; + const double mean = alpha / total; + const double std = sqrt(alpha * beta / (total + 1)) / total; + if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) { + // Avoid the singularity at x = mean. + const double poly = 47 * x * (beta*beta)*(beta*beta) + alpha * ( + (43 + 20 * (16 + 27 * beta) * x) * (beta*beta)*beta + alpha * ( + 3 * (59 + 180 * beta - 90 * x) * (beta*beta) + alpha * ( + (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * ( + 8 * (1 - x) * (135 * beta - 11))))); + const double prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total); + const double prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total); + return prefactor_num / (1 - x) * poly / prefactor_den; + } + const double prefactor = -x / sqrt(2 * alpha * beta / total); + const double stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha*alpha)) + * (1 + 1 / (12 * beta) + 1 / (288 * beta*beta)) + / (1 + 1 / (12 * total) + 1 / (288 * total*total)); + const double term1_num = 2 * (alpha*alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta*beta); + const double axbx = alpha * (x-1) + beta * x; + const double term1_den = sqrt(2 * alpha / beta) * pow(total, 1.5f) * axbx*axbx; + const double term1 = term1_num / term1_den; + const double term2 = 0.5f * log(alpha / (total * x)); + const double term3_num = sqrt(8 * alpha * beta / total); + const double term3_den = beta * x + alpha * (x - 1); + const double term3 = term3_num / term3_den; + const double term4_base = beta * log(beta / (total * (1 - x))) + + alpha * log(alpha / (total * x)); + const double term4 = pow(term4_base, -1.5f); + const double term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4)); + return stirling * prefactor * term1234; +} + +// Computes a scaled reparameterized gradient +// -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x) +// for random number x drawn from a Beta distribution Beta(alpha,beta). +// This function inputs total=alpha+beta to make it easy to implement +// Dirichlet reparameterized gradients in terms of Betas. +static inline real THTensor_(dirichlet_grad_one)(real x, real alpha, real total) { + const real beta = total - alpha; + const real boundary = total * x * (1 - x); + + // Use an asymptotic approximation for x close to 0. + if (x <= 0.5f && boundary < 2.5f) { + return THTensor_(beta_grad_alpha_small)(x, alpha, beta); + } + + // Use an asymptotic approximation for x close to 1. + if (x >= 0.5f && boundary < 0.75f) { + return -THTensor_(beta_grad_beta_small)(1 - x, beta, alpha); + } + + // Use an asymptotic approximation when alpha and (total - alpha) are both large. + if (alpha > 6 && beta > 6) { + return THTensor_(beta_grad_alpha_mid)(x, alpha, beta); + } + + // Use a rational correction to an analytic approximation. 
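+  // The table c below holds the coefficients of two polynomials p and q (cubic in b,
+  // quadratic in u and a) evaluated at u = log(x), a = log(alpha) - u, b = log(total) - a;
+  // the result is (p / q) times the analytic approximation
+  // x * (digamma(total) - digamma(alpha)) / beta computed at the end of this function.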
+ static const real c[2][3][3][4] = { + {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863}, + {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033}, + {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}}, + {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814}, + {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057}, + {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}}, + {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565}, + {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181}, + {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}}, + {{{1, -0.02924021934, -0.04438342661, 0.007285809825}, + {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521}, + {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}}, + {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273}, + {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956}, + {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}}, + {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05}, + {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05}, + {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}}, + }; + const real u = TH_MATH_NAME(log)(x); + const real a = TH_MATH_NAME(log)(alpha) - u; + const real b = TH_MATH_NAME(log)(total) - a; + const real pow_u[3] = {1, u, u * u}; + const real pow_a[3] = {1, a, a * a}; + real p = 0.0; + real q = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + const real ua = pow_u[i] * pow_a[j]; + p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3]))); + q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3]))); + } + } + const real approx = x * (TH_MATH_NAME(TH_digamma)(total) - TH_MATH_NAME(TH_digamma)(alpha)) / beta; + return p / q * approx; +} + +void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total) +{ + x = THTensor_(newContiguous)(x); + alpha = THTensor_(newContiguous)(alpha); + total = THTensor_(newContiguous)(total); + TH_CHECK_SAME_SIZE(alpha, x); + TH_CHECK_SAME_SIZE(total, x); + THTensor_(resizeAs)(self, x); + THTensor* grad = THTensor_(newContiguous)(self); + + real*const grad_data = THTensor_(data)(grad); + real*const x_data = THTensor_(data)(x); + real*const alpha_data = THTensor_(data)(alpha); + real*const total_data = THTensor_(data)(total); + const int64_t numel = THTensor_(nElement)(x); + int64_t i; + #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for(i = 0; i < numel; ++i) { + grad_data[i] = THTensor_(dirichlet_grad_one)(x_data[i], alpha_data[i], total_data[i]); + } + + THTensor_(freeCopyTo)(grad, self); +} + + +#undef TH_MATH_NAME +#endif /* floating point only part */ +#undef IS_NONZERO +#endif diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h new file mode 100644 index 0000000..08f3f15 --- /dev/null +++ b/aten/src/TH/generic/THTensorMath.h @@ -0,0 +1,214 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorMath.h" +#else + +TH_API void THTensor_(fill)(THTensor *r_, real value); +TH_API void THTensor_(zero)(THTensor *r_); + +TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value); +TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src); +TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); + +TH_API 
void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor); + +TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); +TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val); +TH_API void THTensor_(take)(THTensor *tensor, THTensor *src, THLongTensor *index); +TH_API void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate); + +TH_API void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); +TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); +TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val); + +TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src); + +TH_API real THTensor_(minall)(THTensor *t); +TH_API real THTensor_(maxall)(THTensor *t); +TH_API real THTensor_(medianall)(THTensor *t); +TH_API accreal THTensor_(sumall)(THTensor *t); +TH_API accreal THTensor_(prodall)(THTensor *t); + +TH_API void THTensor_(neg)(THTensor *self, THTensor *src); +TH_API void THTensor_(cinv)(THTensor *self, THTensor *src); + +TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(sub)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(add_scaled)(THTensor *r_, THTensor *t, real value, real alpha); +TH_API void THTensor_(sub_scaled)(THTensor *r_, THTensor *t, real value, real alpha); +TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(lshift)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value); +TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value); + +TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src); +TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2); +TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); + +TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2); +TH_API void THTensor_(addcdiv)(THTensor *r_, 
THTensor *t, real value, THTensor *src1, THTensor *src2); + +TH_API void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec); +TH_API void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat1, THTensor *mat2); +TH_API void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2); + +TH_API void THTensor_(addbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); +TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); + +TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain); + +TH_API ptrdiff_t THTensor_(numel)(THTensor *t); +void THTensor_(preserveReduceDimSemantics)(THTensor *r_, int in_dims, int reduce_dimension, int keepdim); +TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim); +TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension); +TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension); +TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); +TH_API accreal THTensor_(trace)(THTensor *t); +TH_API void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension); + +TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src); +TH_API void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src); +TH_API void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value); +TH_API void THTensor_(cminValue)(THTensor *r, THTensor *t, real value); + +TH_API void THTensor_(zerosLike)(THTensor *r_, THTensor *input); +TH_API void THTensor_(onesLike)(THTensor *r_, THTensor *input); +TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k); +TH_API void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m); +TH_API void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step); +TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step); +TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n); + +TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder); +TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted); +TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k); +TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k); +TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension); +TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension); + +TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb); + +TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, real value); 
+TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(geValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(neValue)(THByteTensor *r_, THTensor* t, real value); +TH_API void THTensor_(eqValue)(THByteTensor *r_, THTensor* t, real value); + +TH_API void THTensor_(ltValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(leValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(gtValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(geValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(neValueT)(THTensor *r_, THTensor* t, real value); +TH_API void THTensor_(eqValueT)(THTensor *r_, THTensor* t, real value); + +TH_API void THTensor_(ltTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(leTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(gtTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(geTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(neTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(eqTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); + +TH_API void THTensor_(ltTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(leTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(gtTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(geTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(neTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); +TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); + +TH_API void THTensor_(pow)(THTensor *r_, THTensor *t, real value); +TH_API void THTensor_(tpow)(THTensor *r_, real value, THTensor *t); + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) +TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); +#endif + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log)(THTensor *r_, THTensor *t); +TH_API void THTensor_(lgamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(digamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(trigamma)(THTensor *r_, THTensor *t); +TH_API void THTensor_(polygamma)(THTensor *r_, int64_t n, THTensor *t); +TH_API void THTensor_(log10)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t); +TH_API void THTensor_(log2)(THTensor *r_, THTensor *t); +TH_API void THTensor_(exp)(THTensor *r_, THTensor *t); +TH_API void THTensor_(expm1)(THTensor *r_, THTensor *t); +TH_API void THTensor_(cos)(THTensor *r_, THTensor *t); +TH_API void THTensor_(acos)(THTensor *r_, THTensor *t); +TH_API void THTensor_(cosh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sin)(THTensor *r_, THTensor *t); +TH_API void THTensor_(asin)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sinh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(tan)(THTensor *r_, THTensor *t); +TH_API void THTensor_(atan)(THTensor *r_, THTensor *t); +TH_API void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty); +TH_API void THTensor_(tanh)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erf)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erfc)(THTensor *r_, THTensor *t); +TH_API void THTensor_(erfinv)(THTensor *r_, THTensor *t); +TH_API void THTensor_(sqrt)(THTensor *r_, THTensor *t); +TH_API void THTensor_(rsqrt)(THTensor 
*r_, THTensor *t); +TH_API void THTensor_(ceil)(THTensor *r_, THTensor *t); +TH_API void THTensor_(floor)(THTensor *r_, THTensor *t); +TH_API void THTensor_(round)(THTensor *r_, THTensor *t); +TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); +TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t); +TH_API void THTensor_(frac)(THTensor *r_, THTensor *t); +TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight); + +TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim); +TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim); +TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim); +TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm); +TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value); +TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); +TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); + +TH_API accreal THTensor_(meanall)(THTensor *self); +TH_API accreal THTensor_(varall)(THTensor *self, int biased); +TH_API accreal THTensor_(stdall)(THTensor *self, int biased); +TH_API accreal THTensor_(normall)(THTensor *t, real value); + +TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n); +TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n); + +TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total); +#endif + +#if defined(TH_REAL_IS_BYTE) + +TH_API int THTensor_(logicalAndAll)(THTensor *self); +TH_API int THTensor_(logicalAnyAll)(THTensor *self); +TH_API void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim); +TH_API void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim); + +#endif /* TH_REAL_IS_BYTE */ + +#endif diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp new file mode 100644 index 0000000..3ddbfa6 --- /dev/null +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -0,0 +1,552 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorRandom.cpp" +#else + +#include + +#ifdef _OPENMP +#include +#endif + +#include + +#include "THGenerator.hpp" + +void THTensor_(random)(THTensor *self, THGenerator *_generator) +{ + std::lock_guard lock(_generator->mutex); +#if defined(TH_REAL_IS_BYTE) + TH_TENSOR_APPLY(real, self, *self_data = (uint8_t)(THRandom_random(_generator) % (UINT8_MAX + 1));); +#elif defined(TH_REAL_IS_CHAR) + TH_TENSOR_APPLY(real, self, *self_data = (int8_t)(THRandom_random(_generator) % (INT8_MAX + 1));); +#elif defined(TH_REAL_IS_SHORT) + TH_TENSOR_APPLY(real, self, *self_data = (int16_t)(THRandom_random(_generator) % (INT16_MAX + 1));); +#elif defined(TH_REAL_IS_INT) + TH_TENSOR_APPLY(real, self, *self_data = (int32_t)(THRandom_random(_generator) % (INT32_MAX + 1UL));); +#elif defined(TH_REAL_IS_LONG) + TH_TENSOR_APPLY(real, self, *self_data = (uint64_t)(THRandom_random64(_generator) % (LONG_MAX + 1ULL));); +#elif defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY(real, self, *self_data = (float)(THRandom_random(_generator) % ((1ULL << FLT_MANT_DIG) + 1));); +#elif defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY(real, self, *self_data = (double)(THRandom_random64(_generator) % ((1ULL << 
DBL_MANT_DIG) + 1));); +#else +#error "Unknown type" +#endif + +} + +void THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max) { + std::lock_guard lock(_generator->mutex); + THArgCheck(max > min, 2, "max must be greater than min, but got: min = %lld, max = %lld", min, max); + uint64_t range = max - min; +#if defined(TH_REAL_IS_LONG) || defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + if (range >= 1ULL << 32) { + TH_TENSOR_APPLY(real, self, *self_data = static_cast(static_cast((THRandom_random64(_generator) % range) + min));) + return; + } +#endif + TH_TENSOR_APPLY(real, self, *self_data = static_cast(static_cast((THRandom_random(_generator) % range) + min));) +} + +void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max) { + THArgCheck(max > 0, 1, "max must be positive, but got: max = %lld", max); + THTensor_(clampedRandom)(self, _generator, 0, max); +} + +void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_geometric(_generator, p);); +} + +#ifdef TH_BLAS_MKL +#define BERNOULLI_OMP 800 +#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 + +void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator, const double p) +{ + int64_t seed = THRandom_random(_generator); + int64_t n = THTensor_(nElement)(self); + int contig = THTensor_(isContiguous)(self); + int *tmp = NULL; + THIntTensor* intTensor = NULL; + + if (contig) { +#ifdef TH_REAL_IS_INT + tmp = THIntTensor_data(self); +#else + tmp = (int*)THAlloc(n*sizeof(int)); +#endif + } else { + intTensor = THIntTensor_new(); + THIntTensor_resizeNd(intTensor, self->dim(), self->size, NULL); + tmp = THIntTensor_data(intTensor); + } + +#ifdef _OPENMP + size_t nthr = !omp_in_parallel() && n >= BERNOULLI_OMP ? omp_get_num_threads() : 1; +#pragma omp parallel num_threads(nthr) firstprivate(nthr) + { + size_t tid = omp_get_thread_num(); + int64_t seg_len_tmp = n / nthr; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == nthr - 1)? 
(n-line_index_offset) : seg_len_tmp; +#else + { + int64_t line_index_offset = 0; + int64_t line_seg_len = n; +#endif + + if (line_seg_len > 0) { + VSLStreamStatePtr stream; + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, line_index_offset); + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, line_seg_len, + tmp + line_index_offset, p); + vslDeleteStream(&stream); + +#ifndef TH_REAL_IS_INT + if (contig) { + real* self_seg = THTensor_(data)(self) + line_index_offset; + int* tmp_seg = tmp + line_index_offset; + THVector_(cvtFromInt)(self_seg, tmp_seg, line_seg_len); + } +#endif + } + } + + if(contig) { +#ifndef TH_REAL_IS_INT + THFree(tmp); +#endif + } else { +#ifdef _OPENMP + TH_TENSOR_APPLY2_OMP(n, 1, 0, int, intTensor, real, self, *self_data = *intTensor_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY) +#else + TH_TENSOR_APPLY2(int, intTensor, real, self, *self_data = *intTensor_data;) +#endif + THIntTensor_free(intTensor); + } + +} + +#endif + +void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) +{ +#ifdef TH_BLAS_MKL + if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { + std::lock_guard lock(_generator->mutex); + THTensor_(iBernoulli_generate_copy)(self, _generator, p); + } else { + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); + } +#else + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); +#endif +} + +void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY2(real, self, float, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); +} + +void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY2(real, self, double, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); +} + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +#if defined(TH_REAL_IS_FLOAT) +#define TH_REAL_MIN FLT_MIN +#elif defined(TH_REAL_IS_DOUBLE) +#define TH_REAL_MIN DBL_MIN +#endif + +void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor* p) +{ +#if defined(TH_REAL_IS_FLOAT) + THTensor_(bernoulli_FloatTensor)(self, _generator, p); +#else + THTensor_(bernoulli_DoubleTensor)(self, _generator, p); +#endif +} + +void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b) +{ + std::lock_guard lock(_generator->mutex); + #if defined(TH_REAL_IS_FLOAT) + TH_TENSOR_APPLY(real, self, *self_data = + (real)THRandom_uniformFloat(_generator, (real)a, (real)b);); + #else + TH_TENSOR_APPLY(real, self, *self_data = + (real)THRandom_uniform(_generator, a, b);); + #endif +} + +void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stddev) +{ + std::lock_guard lock(_generator->mutex); + const int64_t size = THTensor_(numel)(self); + if (size >= 16 && THTensor_(isContiguous)(self)) { + THVector_(normal_fill)(THStorage_(data)(self->storage), size, _generator, mean, stddev); + } else { + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stddev);); + } +} + +void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor *means, double stddev) +{ + THTensor_(resizeAs)(self, means); + THTensor_(normal)(self, gen, 0, stddev); + 
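+  // self currently holds zero-mean N(0, stddev) samples; adding means element-wise
+  // (self += 1 * means) yields samples distributed as N(means[i], stddev).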
THTensor_(cadd)(self, self, 1, means); +} + +void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs) +{ + THTensor_(resizeAs)(self, stddevs); + THTensor_(normal)(self, gen, 0, 1); + THTensor_(cmul)(self, self, stddevs); + THTensor_(add)(self, self, mean); +} + +void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs) +{ + THTensor_(resizeAs)(self, means); + THTensor_(normal)(self, gen, 0, 1); + THTensor_(cmul)(self, self, stddevs); + THTensor_(cadd)(self, self, 1, means); +} + +void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_exponential(_generator, lambda);); +} + +#undef TH_REAL_MIN + +void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_cauchy(_generator, median, sigma);); +} + +void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv) +{ + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_logNormal(_generator, mean, stdv);); +} + +void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor *q) +{ + int64_t inputsize = THTensor_(nElement)(probs); + int64_t i = 0; + THLongTensor *smaller = THLongTensor_newWithSize1d(inputsize); + THLongTensor *larger = THLongTensor_newWithSize1d(inputsize); + int64_t small_c = 0; + int64_t large_c = 0; + THLongTensor_resize1d(J, inputsize); + THTensor_(resize1d)(q, inputsize); + real *q_data = THTensor_(data)(q); + int64_t *J_data = THLongTensor_data(J); + + for (i = 0; i < inputsize; i++) + { + THLongTensor_fastSet1d(J, i, 0L); + real val = THTensor_(fastGet1d)(probs, i); + THTensor_(fastSet1d)(q, i, inputsize*val); + + if (inputsize * val < 1.0) + { + THLongTensor_fastSet1d(smaller, small_c, i); + small_c += 1; + } + else + { + THLongTensor_fastSet1d(larger, large_c, i); + large_c += 1; + } + } + + // Loop through and create little binary mixtures that + // appropriately allocate the larger outcomes over the + // overall uniform mixture. + int64_t large, small; + while (small_c > 0 && large_c > 0) + { + large = THLongTensor_fastGet1d(larger, large_c-1); + small = THLongTensor_fastGet1d(smaller, small_c-1); + + THLongTensor_fastSet1d(J, small, large); + q_data[large * q->stride[0]] -= 1.0 - THTensor_(fastGet1d)(q, small); + + if(q_data[large * q->stride[0]] < 1.0) + { + THLongTensor_fastSet1d(smaller, small_c-1, large); + large_c -= 1; + } + else + { + THLongTensor_fastSet1d(larger, large_c-1, large); + small_c -= 1; + } + } + + real q_min = THTensor_(fastGet1d)(q, inputsize-1); + real q_max = q_min; + real q_temp; + for (i=0; i < inputsize; i++) + { + q_temp = THTensor_(fastGet1d)(q, i); + if (q_temp < q_min) + q_min = q_temp; + else if (q_temp > q_max) + q_max = q_temp; + } + THArgCheckWithCleanup((q_min > 0), + THCleanup(THLongTensor_free(smaller); THLongTensor_free(larger);), 2, + "q_min is less than 0"); + + if (q_max > 1) + { + for (i=0; i < inputsize; i++) + { + q_data[i*q->stride[0]] /= q_max; + } + } + for (i=0; i < inputsize; i++) + { + // sometimes an large index isn't added to J. + // fix it by making the probability 1 so that J isn't indexed. 
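+    // (With q[i] forced to 1, multinomialAliasDraw below keeps index i itself with
+    //  probability 1, so the never-assigned alias slot J[i] is not consulted.)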
+ if(J_data[i] <= 0) + q_data[i] = 1.0; + } + THLongTensor_free(smaller); + THLongTensor_free(larger); +} +void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q) +{ + std::lock_guard lock(_generator->mutex); + int64_t K = THLongTensor_nElement(J); + int64_t output_nelem = THLongTensor_nElement(self); + int64_t i = 0, _mask=0; + real _q; + int64_t rand_ind, sample_idx, J_sample; + + for (i=0; i < output_nelem; i++) + { + rand_ind = THRandom_uniform(_generator, 0, K); + + _q = THTensor_(fastGet1d)(q, rand_ind); + + _mask = THRandom_bernoulli(_generator, _q); + + J_sample = THLongTensor_fastGet1d(J, rand_ind); + + sample_idx = J_sample*(1 -_mask) + (rand_ind+1L) * _mask; + + THLongTensor_fastSet1d(self, i, sample_idx-1L); + } +} +void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement) +{ + std::lock_guard lock(_generator->mutex); + int64_t start_dim = THTensor_(_nDimension)(prob_dist); + int64_t n_dist; + int64_t n_categories; + THDoubleTensor* cum_dist; + int64_t i,j,k; + + if (start_dim == 1) + { + THTensor_(unsqueeze1d)(prob_dist, prob_dist, 0); + } + + n_dist = THTensor_(size)(prob_dist, 0); + n_categories = THTensor_(size)(prob_dist, 1); + + THArgCheckWithCleanup(n_sample > 0, + THCleanup(if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "cannot sample n_sample <= 0 samples"); + + if (!with_replacement) + { + THArgCheckWithCleanup((!with_replacement) && (n_sample <= n_categories), + THCleanup(if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "cannot sample n_sample > prob_dist.size(1) samples without replacement"); + } + + /* cumulative probability distribution vector */ + cum_dist = THDoubleTensor_newWithSize1d(n_categories); + + /* will contain multinomial samples (category indices to be returned) */ + THLongTensor_resize2d(self, n_dist , n_sample); + + for (i=0; istorage, \ + prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \ + ); + THArgCheckWithCleanup((val >= 0), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry < 0)"); + THArgCheckWithCleanup((std::isfinite(val)), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + sum += val; + THDoubleStorage_set( + cum_dist->storage, \ + cum_dist->storageOffset+j*cum_dist->stride[0], \ + sum \ + ); + } + THArgCheckWithCleanup((sum > 0), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (sum of probabilities <= 0)"); + /* normalize cumulative probability distribution so that last val is 1 + i.e. 
doesn't assume original prob_dist row sums to one */ + if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) ) + { + for (j=0; jstride[0]] /= sum; + } + } + + for (j=0; jstride[0]] = 1; + + while(right_pointer - left_pointer > 0) + { + mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; + cum_prob = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \ + ); + if (cum_prob < uniform_sample) + { + left_pointer = mid_pointer + 1; + } + else + { + right_pointer = mid_pointer; + } + } + sample_idx = left_pointer; + + /* store in result tensor (will be incremented for lua compat by wrapper) */ + THLongStorage_set( \ + self->storage, \ + self->storageOffset+i*self->stride[0]+j*self->stride[1], \ + sample_idx \ + ); + + /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */ + if (!with_replacement && j < n_sample - 1) + { + /* update cumulative distribution so that sample cannot be drawn again */ + double diff; + double new_val = 0; + double sum; + + if (sample_idx != 0) + { + new_val = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \ + ); + } + /* marginal cumulative mass (i.e. original probability) of sample */ + diff = THDoubleStorage_get( \ + cum_dist->storage, \ + cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \ + ) - new_val; + /* new sum of marginals is not one anymore... */ + sum = 1.0 - diff; + for (k=0; kstorage, \ + cum_dist->storageOffset+k*cum_dist->stride[0] \ + ); + if (k >= sample_idx) + { + /* remove sampled probability mass from later cumulative probabilities */ + new_val -= diff; + } + /* make total marginals sum to one */ + new_val /= sum; + THDoubleStorage_set( \ + cum_dist->storage, \ + cum_dist->storageOffset+k*cum_dist->stride[0], \ + new_val \ + ); + } + } + } + } + + THDoubleTensor_free(cum_dist); + + if (start_dim == 1) + { + THLongTensor_resize1d(self, n_sample); + THTensor_(squeeze1d)(prob_dist, prob_dist, 0); + } +} +#endif + +#if defined(TH_REAL_IS_BYTE) +void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self) +{ + std::lock_guard lock(_generator->mutex); + static const size_t size = sizeof(THGeneratorState); + THGeneratorState *rng_state; + THTensor_(resize1d)(self, size); + THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); + THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); + rng_state = (THGeneratorState *)THTensor_(data)(self); + THGeneratorState_copy(rng_state, &_generator->gen_state); +} + +void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self) +{ + std::lock_guard lock(_generator->mutex); + static const size_t size = sizeof(THGeneratorState); + THGeneratorState *rng_state; + THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); + THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); + rng_state = (THGeneratorState *)THTensor_(data)(self); + THArgCheck(THGeneratorState_isValid(rng_state), 1, "Invalid RNG state"); + THGeneratorState_copy(&_generator->gen_state, rng_state); +} +#endif +#endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h new file mode 100644 index 0000000..dc6bdaf --- /dev/null +++ b/aten/src/TH/generic/THTensorRandom.h @@ -0,0 +1,33 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THTensorRandom.h" +#else + +TH_API void THTensor_(random)(THTensor *self, THGenerator *_generator); +TH_API void 
THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max); +TH_API void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max); +TH_API void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p); +TH_API void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p); +TH_API void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p); +TH_API void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p); + +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) +TH_API void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor *p); +TH_API void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b); +TH_API void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv); +TH_API void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor *means, double stddev); +TH_API void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs); +TH_API void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs); +TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda); +TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma); +TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv); +TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement); +TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor *J, THTensor *q); +TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THGenerator *_generator, THLongTensor *J, THTensor *q); +#endif + +#if defined(TH_REAL_IS_BYTE) +TH_API void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self); +TH_API void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self); +#endif + +#endif diff --git a/aten/src/TH/generic/THVector.h b/aten/src/TH/generic/THVector.h new file mode 100644 index 0000000..1931700 --- /dev/null +++ b/aten/src/TH/generic/THVector.h @@ -0,0 +1,68 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THVector.h" +#else + +// Opaque C++ struct +struct THGenerator; + +TH_API void THVector_(fill)(real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n); +TH_API void THVector_(adds)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n); +TH_API void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n); +TH_API void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(copy)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(neg)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(normal_fill)(real *data, + const int64_t size, + struct THGenerator *generator, + const real mean, + const real stddev); +#ifndef TH_REAL_IS_INT +TH_API void THVector_(cvtFromInt)(real *y, const int *x, const ptrdiff_t n); +#endif + +#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) +TH_API void 
THVector_(abs)(real *y, const real *x, const ptrdiff_t n); +#endif + +/* floating point only now */ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + +TH_API void THVector_(log)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(lgamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(digamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(trigamma)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log10)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log1p)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(log2)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sigmoid)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(exp)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(expm1)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erf)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erfc)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(erfinv)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cos)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(acos)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cosh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sin)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(asin)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(sinh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(tan)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(atan)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(tanh)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(pow)(real *y, const real *x, const real c, const ptrdiff_t n); +TH_API void THVector_(sqrt)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(rsqrt)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(ceil)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(floor)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(round)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(abs)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(trunc)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(frac)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cinv)(real *y, const real *x, const ptrdiff_t n); + +#endif /* floating point only part */ + +#endif diff --git a/aten/src/TH/generic/THVectorDefault.cpp b/aten/src/TH/generic/THVectorDefault.cpp new file mode 100644 index 0000000..a32701a --- /dev/null +++ b/aten/src/TH/generic/THVectorDefault.cpp @@ -0,0 +1,289 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THVectorDefault.cpp" +#else + +#include "../THRandom.h" + +void THVector_(copy_DEFAULT)(real *x, const real *y, const ptrdiff_t n) { + ptrdiff_t i = 0; + + for(; i (0, 1] for log. 
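+    // Box-Muller: u1 and u2 are independent uniforms; radius = sqrt(-2*ln(u1))
+    // and theta = 2*pi*u2 below turn them into two independent standard
+    // normal values, which are then scaled by stddev and shifted by mean.
+    // The pairs are interleaved 8 apart, so data[j] and data[j + 8] are
+    // transformed together.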
+ const real u2 = data[j + 8]; + + const real radius = sqrt(-2 * log(u1)); + const real theta = 2.0f * M_PI * u2; + + data[j] = radius * cos(theta) * stddev + mean; + data[j + 8] = radius * sin(theta) * stddev + mean; + } +} + +void THVector_(normal_fill_DEFAULT)(real *data, + int64_t size, + THGenerator *generator, + const real mean, + const real stddev) +{ + THAssert(size >= 16 && "Size must be >= 16 for normal fill"); + + for (int64_t i = 0; i < size; ++i) { +#ifdef TH_REAL_IS_FLOAT + data[i] = THRandom_uniformFloat(generator, 0, 1); +#else + data[i] = THRandom_uniform(generator, 0, 1); +#endif + } + + for (int64_t i = 0; i < size - 15; i += 16) { + THVector_(interleaved_normal_fill_16)(data + i, mean, stddev); + } + + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (int64_t i = 0; i < 16; ++i) { +#ifdef TH_REAL_IS_FLOAT + data[i] = THRandom_uniformFloat(generator, 0, 1); +#else + data[i] = THRandom_uniform(generator, 0, 1); +#endif + } + THVector_(interleaved_normal_fill_16)(data, mean, stddev); + } +} + +#define VECTOR_IMPLEMENT_FUNCTION(NAME, CFUNC) \ + void THVector_(NAME)(real *y, const real *x, const ptrdiff_t n) \ + { \ + ptrdiff_t i = 0; \ + for(; i + +static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) { + unsigned int cpui[4]; + __cpuid(cpui, __level); + *__eax = cpui[0]; *__ebx = cpui[1]; *__ecx = cpui[2]; *__edx = cpui[3]; + return 1; +} + +static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { + *eax = 0; *edx = 0; + if (op == 0) + *eax = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +} + +#else + +#if __i386__ +#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +__asm(" pushl %%ebx\n" \ +" cpuid\n" \ +" mov %%ebx,%1\n" \ +" popl %%ebx" \ +: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ +: "0"(__level)) +#else +#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ +: "0"(__level)) +#endif + +static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) { + __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} + +#endif + +enum ECPUFeature +{ + kCPUFeature_SSE = 0x01, + kCPUFeature_SSE2 = 0x02, + kCPUFeature_SSE3 = 0x04, + kCPUFeature_SSE3_S = 0x08, + kCPUFeature_SSE4_1 = 0x10, + kCPUFeature_SSE4_2 = 0x20, + kCPUFeature_AVX = 0x40 +}; + +static unsigned int checkCPUFeatures() { + unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; + unsigned int features = 0; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + if( (edx & (1 << 25)) != 0 ) { + features |= kCPUFeature_SSE; + } + if( (edx & (1 << 26)) != 0 ) { + features |= kCPUFeature_SSE2; + } + if( (ecx & (1 << 0)) != 0 ) { + features |= kCPUFeature_SSE3; + } + if( (ecx & (1 << 9)) != 0 ) { + features |= kCPUFeature_SSE3_S; + } + if( (ecx & (1 << 19)) != 0 ) { + features |= kCPUFeature_SSE4_1; + } + if( (ecx & (1 << 20)) != 0 ) { + features |= kCPUFeature_SSE4_2; + } + if( (ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0 ) { + xgetbv(0, &eax, &edx); + if( (eax & 6) == 6 ) { + features |= kCPUFeature_AVX; + } + } + return features; +} + +#include + +static int haveCPUFeature(unsigned int feature) { + static unsigned int 
sCPUFeatures = 0; + static int sDetectedCPUFeatures = 0; + if (!sDetectedCPUFeatures) { + sDetectedCPUFeatures = 1; + sCPUFeatures = checkCPUFeatures(); + if ((sCPUFeatures & kCPUFeature_AVX) != 0) { + printf("torch running avx\n"); + } else { + printf("torch running sse \n"); + } + } + return (sCPUFeatures & feature) != 0; +} + +#endif + +#include + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); + +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols) { +#if defined(__AVX__) + int avx = haveCPUFeature(kCPUFeature_AVX); + if (avx) + { + convolve_5x5_avx(output, input, kernel, outRows, outCols, outCols, inCols); + } + else +#endif + { + convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols); + } +} diff --git a/aten/src/TH/generic/simd/convolve.h b/aten/src/TH/generic/simd/convolve.h new file mode 100644 index 0000000..fa04ce9 --- /dev/null +++ b/aten/src/TH/generic/simd/convolve.h @@ -0,0 +1 @@ +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols); \ No newline at end of file diff --git a/aten/src/TH/generic/simd/convolve5x5_avx.cpp b/aten/src/TH/generic/simd/convolve5x5_avx.cpp new file mode 100644 index 0000000..560474b --- /dev/null +++ b/aten/src/TH/generic/simd/convolve5x5_avx.cpp @@ -0,0 +1,214 @@ +#include +#include "common_simd.h" +#include + + +#define CLEAR_AVX() _mm256_zeroupper() + +void convolve_5x5_1_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_1() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(1, i) + } +} + +void convolve_5x5_2_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_2() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(2, i) + } +} + +void convolve_5x5_4_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_4() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(4, i) + } +} + +void convolve_5x5_5_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_5() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(5, i) + } +} + +void convolve_5x5_6_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_6() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(6, i) + } +} + +void convolve_5x5_7_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_7() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(7, i) + } +} + +void convolve_5x5_8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t 
alignedCount = count & 0xFFFFFFF8; + DECLARE_OUTPUT_8() + for (; i < alignedCount; i+=8) { + CONVOLVE_8COLS_XROWS(8, i) + } +} + +void convolve_5x5_64x64_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 60; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + CONVOLVE_8COLS_XROWS(6, 16) + CONVOLVE_8COLS_XROWS(6, 24) + CONVOLVE_8COLS_XROWS(6, 32) + CONVOLVE_8COLS_XROWS(6, 40) + CONVOLVE_8COLS_XROWS(6, 48) + CONVOLVE_8COLS_XROWS(6, 56) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_8COLS_XROWS(4, 0) + CONVOLVE_8COLS_XROWS(4, 8) + CONVOLVE_8COLS_XROWS(4, 16) + CONVOLVE_8COLS_XROWS(4, 24) + CONVOLVE_8COLS_XROWS(4, 32) + CONVOLVE_8COLS_XROWS(4, 40) + CONVOLVE_8COLS_XROWS(4, 48) + CONVOLVE_8COLS_XROWS(4, 56) +} + +void convolve_5x5_32x32_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 30; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + CONVOLVE_8COLS_XROWS(6, 16) + CONVOLVE_8COLS_XROWS(6, 24) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_2() + CONVOLVE_8COLS_XROWS(2, 0) + CONVOLVE_8COLS_XROWS(2, 8) + CONVOLVE_8COLS_XROWS(2, 16) + CONVOLVE_8COLS_XROWS(2, 24) +} + +void convolve_5x5_16x16_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 12; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_8COLS_XROWS(6, 0) + CONVOLVE_8COLS_XROWS(6, 8) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_8COLS_XROWS(4, 0) + CONVOLVE_8COLS_XROWS(4, 8) +} + +void convolve_5x5_8x8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + DECLARE_OUTPUT_8() + CONVOLVE_8COLS_XROWS(8, 0) +} + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); + +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t ic = inCols; + int64_t yy = 0; + float* t_ = input; + float* r_ = output; + float* k_ = kernel; + + if((outRows == 64) && (outCols == 64)) { + convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 32) && (outCols == 32)) { + convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 16) && (outCols == 16)) { + convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 8) && (outCols == 8)) { + convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols); + return; + } + + for(; yy < (outRows / 6 ) * 6; yy += 6) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_6_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 6); + } + + // more than 2 rows left to process and we ended up on a non-multiple of 4 + if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { + // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 2); + yy += 2; + } + + for(; yy < (outRows & 
0xFFFFFFFC); yy += 4) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_4_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 4); + } + + for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 2); + } + + for(; yy < outRows; yy += 1) { + float *pi_ = t_ + yy*ic; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_1_avx(r_, pis_, pw_, outCols, outStride, ic); + r_ += (outStride * 1); + } + + int64_t procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time + int64_t remCols = outCols - procCols; + + //process the rest using sse + if( remCols > 0) { + CLEAR_AVX(); + convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols); + } +} \ No newline at end of file diff --git a/aten/src/TH/generic/simd/convolve5x5_sse.cpp b/aten/src/TH/generic/simd/convolve5x5_sse.cpp new file mode 100644 index 0000000..9de9a4a --- /dev/null +++ b/aten/src/TH/generic/simd/convolve5x5_sse.cpp @@ -0,0 +1,321 @@ +#include +#include "common_simd.h" +#include + + +/* SSE variants */ +void convolve_5x5_1_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_1() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(1, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + } +} + +void convolve_5x5_2_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_2() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(2, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + } +} + +void convolve_5x5_4_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_4() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(4, i) + } + for (; i < (count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = 
output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + } +} + +void convolve_5x5_6_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_6() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(6, i) + } + for (; i<(count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + float output4 = output[i + outputStride * 4]; + float output5 = output[i + outputStride * 5]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; + output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + output[i + outputStride * 4] = output4; + output[i + outputStride * 5] = output5; + } +} + +void convolve_5x5_8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; + DECLARE_OUTPUT_8() + for (; i < alignedCount4; i+=4) { + CONVOLVE_4COLS_XROWS(8, i) + } + for (; i<(count); i++) { + float output0 = output[i + outputStride * 0]; + float output1 = output[i + outputStride * 1]; + float output2 = output[i + outputStride * 2]; + float output3 = output[i + outputStride * 3]; + float output4 = output[i + outputStride * 4]; + float output5 = output[i + outputStride * 5]; + float output6 = output[i + outputStride * 6]; + float output7 = output[i + outputStride * 7]; + int row; + for (row = 0; row < 5; row++) { + int col; + for (col = 0; col < 5; col++) { + output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; + output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; + output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; + output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; + output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; + output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; + output6 += weight[5 * row + col] * image[i + (row + 6) * inputStride + col]; + output7 += weight[5 * row + col] * image[i + (row + 7) * inputStride + col]; + } + } + output[i + outputStride * 0] = output0; + output[i + outputStride * 1] = output1; + output[i + outputStride * 2] = output2; + output[i + outputStride * 3] = output3; + output[i + outputStride * 4] = output4; + output[i + outputStride * 5] = output5; + output[i + outputStride * 6] = output6; + output[i + outputStride * 7] = output7; + } +} + +#define UNROLL_SSE_CONVOLUTION 0 +#if (UNROLL_SSE_CONVOLUTION) + +void convolve_5x5_64x64_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 60; i+=6) + { + DECLARE_OUTPUT_6() 
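+    // Fully unrolled 64x64 case: each CONVOLVE_4COLS_XROWS(6, c) accumulates
+    // a 6-row by 4-column output tile starting at column c, so the sixteen
+    // calls below cover all 64 columns for the current block of 6 rows; the
+    // DECLARE_OUTPUT_4 block after the loop handles the remaining 4 rows.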
+ CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + CONVOLVE_4COLS_XROWS(6, 16) + CONVOLVE_4COLS_XROWS(6, 20) + CONVOLVE_4COLS_XROWS(6, 24) + CONVOLVE_4COLS_XROWS(6, 28) + CONVOLVE_4COLS_XROWS(6, 32) + CONVOLVE_4COLS_XROWS(6, 36) + CONVOLVE_4COLS_XROWS(6, 40) + CONVOLVE_4COLS_XROWS(6, 44) + CONVOLVE_4COLS_XROWS(6, 48) + CONVOLVE_4COLS_XROWS(6, 52) + CONVOLVE_4COLS_XROWS(6, 56) + CONVOLVE_4COLS_XROWS(6, 60) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_4COLS_XROWS(4, 0) + CONVOLVE_4COLS_XROWS(4, 4) + CONVOLVE_4COLS_XROWS(4, 8) + CONVOLVE_4COLS_XROWS(4, 12) + CONVOLVE_4COLS_XROWS(4, 16) + CONVOLVE_4COLS_XROWS(4, 20) + CONVOLVE_4COLS_XROWS(4, 24) + CONVOLVE_4COLS_XROWS(4, 28) + CONVOLVE_4COLS_XROWS(4, 32) + CONVOLVE_4COLS_XROWS(4, 36) + CONVOLVE_4COLS_XROWS(4, 40) + CONVOLVE_4COLS_XROWS(4, 44) + CONVOLVE_4COLS_XROWS(4, 48) + CONVOLVE_4COLS_XROWS(4, 52) + CONVOLVE_4COLS_XROWS(4, 56) + CONVOLVE_4COLS_XROWS(4, 60) +} + +void convolve_5x5_32x32_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 30; i+=6) + { + DECLARE_OUTPUT_6() + + CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + CONVOLVE_4COLS_XROWS(6, 16) + CONVOLVE_4COLS_XROWS(6, 20) + CONVOLVE_4COLS_XROWS(6, 24) + CONVOLVE_4COLS_XROWS(6, 28) + + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_2() + CONVOLVE_4COLS_XROWS(2, 0) + CONVOLVE_4COLS_XROWS(2, 4) + CONVOLVE_4COLS_XROWS(2, 8) + CONVOLVE_4COLS_XROWS(2, 12) + CONVOLVE_4COLS_XROWS(2, 16) + CONVOLVE_4COLS_XROWS(2, 20) + CONVOLVE_4COLS_XROWS(2, 24) + CONVOLVE_4COLS_XROWS(2, 28) +} + +void convolve_5x5_16x16_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + for(int i = 0; i < 12; i+=6) + { + DECLARE_OUTPUT_6() + CONVOLVE_4COLS_XROWS(6, 0) + CONVOLVE_4COLS_XROWS(6, 4) + CONVOLVE_4COLS_XROWS(6, 8) + CONVOLVE_4COLS_XROWS(6, 12) + output += outputStride * 6; + image += inputStride * 6; + } + DECLARE_OUTPUT_4() + CONVOLVE_4COLS_XROWS(4, 0) + CONVOLVE_4COLS_XROWS(4, 4) + CONVOLVE_4COLS_XROWS(4, 8) + CONVOLVE_4COLS_XROWS(4, 12) +} + +void convolve_5x5_8x8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + DECLARE_OUTPUT_8() + CONVOLVE_4COLS_XROWS(8, 0) + CONVOLVE_4COLS_XROWS(8, 4) +} + +#endif + +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t yy = 0; + float* t_ = input; + float* r_ = output; + float* k_ = kernel; +#if (UNROLL_SSE_CONVOLUTION) + if((outRows == 64) && (outCols == 64)) { + convolve_5x5_64x64_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 32) && (outCols == 32)) { + convolve_5x5_32x32_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 16) && (outCols == 16)) { + convolve_5x5_16x16_sse(output, input, kernel, outRows, outStride, inCols); + return; + } + + if((outRows == 8) && (outCols == 8)) { + convolve_5x5_8x8_sse(output, input, kernel, outRows, outStride, inCols); + return; + } +#endif + for(; yy < (outRows / 6 ) * 6; yy += 6) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_6_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 6); + } + // more than 
2 rows left to process and we ended up on a non-multiple of 4 + if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { + // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 2); + yy += 2; + } + + for(; yy < (outRows & 0xFFFFFFFC); yy += 4) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_4_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 4); + } + + for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 2); + } + + for(; yy < outRows; yy += 1) { + float *pi_ = t_ + yy*inCols; + float *pw_ = k_; + float *pis_ = pi_; + convolve_5x5_1_sse(r_, pis_, pw_, outCols, outStride, inCols); + r_ += (outStride * 1); + } +} diff --git a/aten/src/TH/generic/simd/simd.h b/aten/src/TH/generic/simd/simd.h new file mode 100644 index 0000000..33c08b0 --- /dev/null +++ b/aten/src/TH/generic/simd/simd.h @@ -0,0 +1,165 @@ +#ifndef TH_SIMD_INC +#define TH_SIMD_INC + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +// Can be found on Intel ISA Reference for CPUID +#define CPUID_AVX2_BIT 0x20 // Bit 5 of EBX for EAX=0x7 +#define CPUID_AVX_BIT 0x10000000 // Bit 28 of ECX for EAX=0x1 +#define CPUID_SSE_BIT 0x2000000 // bit 25 of EDX for EAX=0x1 + +// Helper macros for initialization +#define FUNCTION_IMPL(NAME, EXT) \ + { (void *)NAME, \ + EXT \ + } + +#define INIT_DISPATCH_PTR(OP) \ + do { \ + size_t i; \ + for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \ + THVector_(OP ## _DISPATCHPTR) = reinterpret_cast(THVector_(OP ## _DISPATCHTABLE)[i].function); \ + if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) { \ + break; \ + } \ + } \ + } while(0) + + +typedef struct FunctionDescription +{ + void *function; + uint32_t supportedSimdExt; +} FunctionDescription; + + +enum SIMDExtensions +{ +#if defined(__NEON__) + SIMDExtension_NEON = 0x1, +#elif defined(__PPC64__) + SIMDExtension_VSX = 0x1, +#else + SIMDExtension_AVX2 = 0x1, + SIMDExtension_AVX = 0x2, + SIMDExtension_SSE = 0x4, +#endif + SIMDExtension_DEFAULT = 0x0 +}; + + +#if defined(__arm__) || defined(__aarch64__) // incl. 
armel, armhf, arm64 + + #if defined(__NEON__) + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_NEON; +} + + #else //ARM without NEON + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_DEFAULT; +} + + #endif + +#elif defined(__PPC64__) + + #if defined(__VSX__) + +static inline uint32_t detectHostSIMDExtensions() +{ + uint32_t hostSimdExts = SIMDExtension_DEFAULT; + char *evar; + + evar = getenv("TH_NO_VSX"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + hostSimdExts = SIMDExtension_VSX; + return hostSimdExts; +} + + #else //PPC64 without VSX + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_DEFAULT; +} + + #endif + +#else // x86 +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ +#if defined(_MSC_VER) + uint32_t cpuInfo[4]; + __cpuid((int *)cpuInfo, *eax); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid (level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile ( "cpuid\n\t" + : "+a"(a), "=b"(b), "+c"(c), "=d"(d) ); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +static inline uint32_t detectHostSIMDExtensions() +{ + uint32_t eax, ebx, ecx, edx; + uint32_t hostSimdExts = 0x0; + int TH_NO_AVX = 1, TH_NO_AVX2 = 1, TH_NO_SSE = 1; + char *evar; + + evar = getenv("TH_NO_AVX2"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_AVX2 = 0; + + // Check for AVX2. Requires separate CPUID + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if ((ebx & CPUID_AVX2_BIT) && TH_NO_AVX2 == 0) { + hostSimdExts |= SIMDExtension_AVX2; + } + + // Detect and enable AVX and SSE + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + evar = getenv("TH_NO_AVX"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_AVX = 0; + if (ecx & CPUID_AVX_BIT && TH_NO_AVX == 0) { + hostSimdExts |= SIMDExtension_AVX; + } + + evar = getenv("TH_NO_SSE"); + if (evar == NULL || strncmp(evar, "1", 1) != 0) + TH_NO_SSE = 0; + if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) { + hostSimdExts |= SIMDExtension_SSE; + } + + return hostSimdExts; +} + +#endif // end SIMD extension detection code + +#endif diff --git a/aten/src/TH/vector/AVX.cpp b/aten/src/TH/vector/AVX.cpp new file mode 100644 index 0000000..b39b803 --- /dev/null +++ b/aten/src/TH/vector/AVX.cpp @@ -0,0 +1,309 @@ +#if defined(__AVX__) +#ifndef _MSC_VER +#include +#else +#include +#endif + +#include "AVX.h" +#include "THGeneral.h" + +void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + for (i=0; i<=((n)-8); i+=8) { + _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i)); + _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4)); + } + off = (n) - ((n)%8); + for (i=0; i<((n)%8); i++) { + y[off+i] = x[off+i]; + } +} + +void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + __m256d YMM0 = _mm256_set_pd(c, c, c, c); + for (i=0; i<=((n)-16); i+=16) { + _mm256_storeu_pd((x)+i , YMM0); + _mm256_storeu_pd((x)+i+4, YMM0); + _mm256_storeu_pd((x)+i+8, YMM0); + _mm256_storeu_pd((x)+i+12, YMM0); + } + off = (n) - ((n)%16); + for (i=0; i<((n)%16); i++) { + x[off+i] = c; + } +} + +void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) __ubsan_ignore_float_divide_by_zero__ { + ptrdiff_t i; + __m256d YMM0, YMM1, YMM2, YMM3; + for (i=0; 
i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_loadu_pd(y+i); + YMM3 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_div_pd(YMM0, YMM2); + YMM3 = _mm256_div_pd(YMM1, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) __ubsan_ignore_float_divide_by_zero__ { + ptrdiff_t i; + __m256d YMM15 = _mm256_set_pd(c, c, c, c); + __m256d YMM0, YMM1; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM0 = _mm256_div_pd(YMM0, YMM15); + YMM1 = _mm256_div_pd(YMM1, YMM15); + _mm256_storeu_pd(y+i, YMM0); + _mm256_storeu_pd(y+i+4, YMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + __m256d YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(x+i); + YMM1 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_loadu_pd(y+i); + YMM3 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_mul_pd(YMM0, YMM2); + YMM3 = _mm256_mul_pd(YMM1, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i + +TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); +TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +TH_API void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +TH_API void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +TH_API void THDoubleVector_cvtFromInt_AVX(double *y, const int *x, const ptrdiff_t n); +TH_API void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n); +TH_API void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +TH_API void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +TH_API void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); +TH_API void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +TH_API void THFloatVector_cvtFromInt_AVX(float *y, const int *x, const ptrdiff_t n); +#endif diff --git a/aten/src/TH/vector/AVX2.cpp b/aten/src/TH/vector/AVX2.cpp new file mode 100644 index 0000000..bde22d3 --- /dev/null +++ b/aten/src/TH/vector/AVX2.cpp @@ -0,0 +1,130 @@ +#if defined(__AVX2__) +#ifndef _MSC_VER +#include +#else +#include +#include +#endif +#include "AVX2.h" +#include +#include "../THRandom.h" + +void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m256d YMM15 = _mm256_set_pd(c, c, c, c); + __m256d 
YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm256_loadu_pd(y+i); + YMM1 = _mm256_loadu_pd(y+i+4); + YMM2 = _mm256_loadu_pd(x+i); + YMM3 = _mm256_loadu_pd(x+i+4); + YMM2 = _mm256_fmadd_pd(YMM0, YMM15, YMM2); + YMM3 = _mm256_fmadd_pd(YMM1, YMM15, YMM3); + _mm256_storeu_pd(z+i, YMM2); + _mm256_storeu_pd(z+i+4, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] + y[i] * c; + } +} + +void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); + __m256 YMM0, YMM1, YMM2, YMM3; + for (i=0; i<=((n)-16); i+=16) { + YMM0 = _mm256_loadu_ps(y+i); + YMM1 = _mm256_loadu_ps(y+i+8); + YMM2 = _mm256_loadu_ps(x+i); + YMM3 = _mm256_loadu_ps(x+i+8); + YMM2 = _mm256_fmadd_ps(YMM0, YMM15, YMM2); + YMM3 = _mm256_fmadd_ps(YMM1, YMM15, YMM3); + _mm256_storeu_ps(z+i, YMM2); + _mm256_storeu_ps(z+i+8, YMM3); + } + for (; i<(n); i++) { + z[i] = x[i] + y[i] * c; + } +} + +static void normal_fill_16_AVX2(float *data, + const __m256* two_pi, + const __m256* one, + const __m256* minus_two, + const __m256* mean, + const __m256* stddev) { + const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data)); + const __m256 u2 = _mm256_loadu_ps(data + 8); + + // sincos256_ps and log256_ps are from avx_mathfun.h + const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1))); + const __m256 theta = _mm256_mul_ps(*two_pi, u2); + + __m256 sintheta, costheta; + sincos256_ps(theta, &sintheta, &costheta); + + const __m256 n1 = _mm256_mul_ps(radius, costheta); + const __m256 n2 = _mm256_mul_ps(radius, sintheta); + + _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *stddev, *mean)); + _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *stddev, *mean)); +} + +void THFloatVector_normal_fill_AVX2(float *data, + const int64_t size, + THGenerator *generator, + const float mean, + const float stddev) +{ + THAssert(size >= 16 && "Size must be >= 16 for AVX2 normal fill"); + const __m256 two_pi = _mm256_set1_ps(2.0f * M_PI); + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 minus_two = _mm256_set1_ps(-2.0f); + const __m256 mean_v = _mm256_set1_ps(mean); + const __m256 stddev_v = _mm256_set1_ps(stddev); + + // First fill the data with the uniform numbers. Box-Mueller is a 2 -> 2 + // mapping of 2 uniform numbers to 2 normal numbers (per iteration), so we + // we need exactly as much space for uniform and normal numbers and can just + // use the single buffer for both. + for (int64_t i = 0; i < size; ++i) { + data[i] = THRandom_uniformFloat(generator, 0, 1); + } + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &stddev_v); + } + + if (size % 16 != 0) { + // We rewind so that we have 16 values and then compute them in one step. 
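+    // Rewinding to the last 16 elements overlaps the block transformed above;
+    // the overlapped entries are simply refilled with fresh uniforms and
+    // re-transformed, so they stay normally distributed. The size >= 16
+    // assertion above guarantees the rewind never runs past the start of
+    // the buffer.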
+ data = data + size - 16; + for (int i = 0; i < 16; ++i) { + data[i] = THRandom_uniformFloat(generator, 0, 1); + } + normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &stddev_v); + } +} + +void THFloatVector_sigmoid_AVX2(float *y, const float *x, const ptrdiff_t n) { + ptrdiff_t i; + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 zero = _mm256_set1_ps(0.0f); + __m256 YMM0, YMM1, YMM2, YMM3; + for (i = 0; i <= ((n)-16); i += 16) { + YMM0 = _mm256_loadu_ps(x + i); + YMM1 = _mm256_loadu_ps(x + i + 8); + YMM0 = _mm256_sub_ps(zero, YMM0); + YMM1 = _mm256_sub_ps(zero, YMM1); + YMM2 = _mm256_add_ps(one, exp256_ps(YMM0)); + YMM3 = _mm256_add_ps(one, exp256_ps(YMM1)); + YMM2 = _mm256_div_ps(one, YMM2); + YMM3 = _mm256_div_ps(one, YMM3); + _mm256_storeu_ps(y + i, YMM2); + _mm256_storeu_ps(y + i + 8, YMM3); + } + for (; i < (n); i++) { + y[i] = 1.0f / (1.0f + expf(-x[i])); + } +} + +#endif // defined(__AVX2__) diff --git a/aten/src/TH/vector/AVX2.h b/aten/src/TH/vector/AVX2.h new file mode 100644 index 0000000..1c281d8 --- /dev/null +++ b/aten/src/TH/vector/AVX2.h @@ -0,0 +1,19 @@ +#ifndef TH_AVX2_H +#define TH_AVX2_H + +#include "THGeneral.h" + +#include +#include + +struct THGenerator; + +TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); +TH_API void THFloatVector_normal_fill_AVX2(float *data, + const int64_t size, + struct THGenerator *generator, + const float mean, + const float stddev); +TH_API void THFloatVector_sigmoid_AVX2(float *y, const float *x, const ptrdiff_t n); +#endif diff --git a/aten/src/TH/vector/NEON.cpp b/aten/src/TH/vector/NEON.cpp new file mode 100644 index 0000000..3966ace --- /dev/null +++ b/aten/src/TH/vector/NEON.cpp @@ -0,0 +1,105 @@ +static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + x[i] = c; + x[i+1] = c; + x[i+2] = c; + x[i+3] = c; + } + + for(; i < n; i++) + x[i] = c; + +} + +static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + z[i] = x[i] * y[i]; + z[i+1] = x[i+1] * y[i+1]; + z[i+2] = x[i+2] * y[i+2]; + z[i+3] = x[i+3] * y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] * y[i]; +} + +static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(; i < n-4; i += 4) + { + y[i] = x[i] * c; + y[i+1] = x[i+1] * c; + y[i+2] = x[i+2] * c; + y[i+3] = x[i+3] * c; + } + + for(; i < n; i++) + y[i] = x[i] * c; +} + +static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + z[i] = x[i] + c * y[i]; + z[i+1] = x[i+1] + c * y[i+1]; + z[i+2] = x[i+2] + c * y[i+2]; + z[i+3] = x[i+3] + c * y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] + c * y[i]; +} + +static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + y[i] = x[i] + c; + y[i+1] = x[i+1] + c; + y[i+2] = x[i+2] + c; + y[i+3] = x[i+3] + c; + } + + for(; i < n; i++) + y[i] = x[i] + c; +} + +static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + z[i] = x[i] / y[i]; + z[i+1] = x[i+1] / y[i+1]; + z[i+2] = x[i+2] / y[i+2]; 
+ z[i+3] = x[i+3] / y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] / y[i]; +} + +static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { + int64_t i = 0; + + for(;i < n-4; i += 4) + { + y[i] = x[i] / c; + y[i+1] = x[i+1] / c; + y[i+2] = x[i+2] / c; + y[i+3] = x[i+3] / c; + } + + for(; i < n; i++) + y[i] = x[i] / c; +} diff --git a/aten/src/TH/vector/SSE.cpp b/aten/src/TH/vector/SSE.cpp new file mode 100644 index 0000000..20d5893 --- /dev/null +++ b/aten/src/TH/vector/SSE.cpp @@ -0,0 +1,303 @@ +#ifndef _MSC_VER +#include +#else +#include +#endif + +static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + ptrdiff_t off; + __m128d XMM0 = _mm_set1_pd(c); + for (i=0; i<=((n)-8); i+=8) { + _mm_storeu_pd((x)+i , XMM0); + _mm_storeu_pd((x)+i+2, XMM0); + _mm_storeu_pd((x)+i+4, XMM0); + _mm_storeu_pd((x)+i+6, XMM0); + } + off = (n) - ((n)%8); + for (i=0; i<((n)%8); i++) { + x[off+i] = c; + } +} + +static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM2; + for (i=0; i<=((n)-2); i+=2) { + XMM0 = _mm_loadu_pd((x)+i); + XMM2 = _mm_loadu_pd((y)+i); + XMM2 = _mm_mul_pd(XMM2, XMM7); + XMM2 = _mm_add_pd(XMM0, XMM2); + _mm_storeu_pd((z)+i, XMM2); + } + for (; i<(n); i++) { + z[i] = x[i] + c * y[i]; + } +} + +static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM2; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_pd((x)+i); + XMM2 = _mm_loadu_pd((x)+i+2); + XMM0 = _mm_add_pd(XMM0, XMM7); + XMM2 = _mm_add_pd(XMM2, XMM7); + _mm_storeu_pd((y)+i, XMM0); + _mm_storeu_pd((y)+i+2, XMM2); + } + for (; i<(n); i++) { + y[i] = x[i] + c; + } +} + +static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + for (i=0; i<=((n)-8); i+=8) { + __m128d XMM0 = _mm_loadu_pd((x)+i ); + __m128d XMM1 = _mm_loadu_pd((x)+i+2); + __m128d XMM2 = _mm_loadu_pd((x)+i+4); + __m128d XMM3 = _mm_loadu_pd((x)+i+6); + __m128d XMM4 = _mm_loadu_pd((y)+i ); + __m128d XMM5 = _mm_loadu_pd((y)+i+2); + __m128d XMM6 = _mm_loadu_pd((y)+i+4); + __m128d XMM7 = _mm_loadu_pd((y)+i+6); + XMM4 = _mm_mul_pd(XMM4, XMM0); + XMM5 = _mm_mul_pd(XMM5, XMM1); + XMM6 = _mm_mul_pd(XMM6, XMM2); + XMM7 = _mm_mul_pd(XMM7, XMM3); + _mm_storeu_pd((z)+i , XMM4); + _mm_storeu_pd((z)+i+2, XMM5); + _mm_storeu_pd((z)+i+4, XMM6); + _mm_storeu_pd((z)+i+6, XMM7); + } + for (; i<(n); i++) { + z[i] = x[i] * y[i]; + } +} + +static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM15 = _mm_set1_pd(c); + for (i=0; i<=((n)-8); i+=8) { + __m128d XMM0 = _mm_loadu_pd((x)+i ); + __m128d XMM1 = _mm_loadu_pd((x)+i+2); + __m128d XMM2 = _mm_loadu_pd((x)+i+4); + __m128d XMM3 = _mm_loadu_pd((x)+i+6); + __m128d XMM4 = _mm_mul_pd(XMM15, XMM0); + __m128d XMM5 = _mm_mul_pd(XMM15, XMM1); + __m128d XMM6 = _mm_mul_pd(XMM15, XMM2); + __m128d XMM7 = _mm_mul_pd(XMM15, XMM3); + _mm_storeu_pd((y)+i , XMM4); + _mm_storeu_pd((y)+i+2, XMM5); + _mm_storeu_pd((y)+i+4, XMM6); + _mm_storeu_pd((y)+i+6, XMM7); + } + for (; i<(n); i++) { + y[i] = x[i] * c; + } +} + +static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM0, XMM1, XMM2, XMM3; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = 
_mm_loadu_pd(x+i); + XMM1 = _mm_loadu_pd(x+i+2); + XMM2 = _mm_loadu_pd(y+i); + XMM3 = _mm_loadu_pd(y+i+2); + XMM2 = _mm_div_pd(XMM0, XMM2); + XMM3 = _mm_div_pd(XMM1, XMM3); + _mm_storeu_pd(z+i, XMM2); + _mm_storeu_pd(z+i+2, XMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { + ptrdiff_t i; + __m128d XMM7 = _mm_set1_pd(c); + __m128d XMM0, XMM1; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_pd(x+i); + XMM1 = _mm_loadu_pd(x+i+2); + XMM0 = _mm_div_pd(XMM0, XMM7); + XMM1 = _mm_div_pd(XMM1, XMM7); + _mm_storeu_pd(y+i, XMM0); + _mm_storeu_pd(y+i+2, XMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM0 = _mm_set_ps1(c); + ptrdiff_t off; + for (i=0; i<=((n)-16); i+=16) { + _mm_storeu_ps((x)+i , XMM0); + _mm_storeu_ps((x)+i+4, XMM0); + _mm_storeu_ps((x)+i+8, XMM0); + _mm_storeu_ps((x)+i+12, XMM0); + } + off = (n) - ((n)%16); + for (i=0; i<((n)%16); i++) { + x[off+i] = c; + } +} + + +static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set_ps1(c); + __m128 XMM0, XMM2; + for (i=0; i<=((n)-4); i+=4) { + XMM0 = _mm_loadu_ps((x)+i); + XMM2 = _mm_loadu_ps((y)+i); + XMM2 = _mm_mul_ps(XMM2, XMM7); + XMM2 = _mm_add_ps(XMM0, XMM2); + _mm_storeu_ps((z)+i, XMM2); + } + for (; i<(n); i++) { + z[i] = x[i] + c * y[i]; + } +} + +static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set1_ps(c); + __m128 XMM0, XMM2; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps((x)+i); + XMM2 = _mm_loadu_ps((x)+i+4); + XMM0 = _mm_add_ps(XMM0, XMM7); + XMM2 = _mm_add_ps(XMM2, XMM7); + _mm_storeu_ps((y)+i, XMM0); + _mm_storeu_ps((y)+i+4, XMM2); + } + for (; i<(n); i++) { + y[i] = x[i] + c; + } +} + +static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) { + ptrdiff_t i; + for (i=0; i<=((n)-16); i+=16) { + __m128 XMM0 = _mm_loadu_ps((x)+i ); + __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); + __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); + __m128 XMM3 = _mm_loadu_ps((x)+i+12); + __m128 XMM4 = _mm_loadu_ps((y)+i ); + __m128 XMM5 = _mm_loadu_ps((y)+i+ 4); + __m128 XMM6 = _mm_loadu_ps((y)+i+ 8); + __m128 XMM7 = _mm_loadu_ps((y)+i+12); + XMM4 = _mm_mul_ps(XMM4, XMM0); + XMM5 = _mm_mul_ps(XMM5, XMM1); + XMM6 = _mm_mul_ps(XMM6, XMM2); + XMM7 = _mm_mul_ps(XMM7, XMM3); + _mm_storeu_ps((z)+i , XMM4); + _mm_storeu_ps((z)+i+ 4, XMM5); + _mm_storeu_ps((z)+i+ 8, XMM6); + _mm_storeu_ps((z)+i+12, XMM7); + } + for (; i<(n); i++) { + z[i] = x[i] * y[i]; + } +} + +static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM15 = _mm_set_ps1(c); + for (i=0; i<=((n)-16); i+=16) { + __m128 XMM0 = _mm_loadu_ps((x)+i ); + __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); + __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); + __m128 XMM3 = _mm_loadu_ps((x)+i+12); + __m128 XMM4 = _mm_mul_ps(XMM15, XMM0); + __m128 XMM5 = _mm_mul_ps(XMM15, XMM1); + __m128 XMM6 = _mm_mul_ps(XMM15, XMM2); + __m128 XMM7 = _mm_mul_ps(XMM15, XMM3); + _mm_storeu_ps((y)+i , XMM4); + _mm_storeu_ps((y)+i+ 4, XMM5); + _mm_storeu_ps((y)+i+ 8, XMM6); + _mm_storeu_ps((y)+i+12, XMM7); + } + for (; i<(n); i++) { + y[i] = x[i] * c; + } +} + +static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const 
ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM0, XMM1, XMM2, XMM3; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps(x+i); + XMM1 = _mm_loadu_ps(x+i+4); + XMM2 = _mm_loadu_ps(y+i); + XMM3 = _mm_loadu_ps(y+i+4); + XMM2 = _mm_div_ps(XMM0, XMM2); + XMM3 = _mm_div_ps(XMM1, XMM3); + _mm_storeu_ps(z+i, XMM2); + _mm_storeu_ps(z+i+4, XMM3); + } + for (; i<(n); i++) { + z[i] = x[i] / y[i]; + } +} + +static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { + ptrdiff_t i; + __m128 XMM7 = _mm_set1_ps(c); + __m128 XMM0, XMM1; + for (i=0; i<=((n)-8); i+=8) { + XMM0 = _mm_loadu_ps(x+i); + XMM1 = _mm_loadu_ps(x+i+4); + XMM0 = _mm_div_ps(XMM0, XMM7); + XMM1 = _mm_div_ps(XMM1, XMM7); + _mm_storeu_ps(y+i, XMM0); + _mm_storeu_ps(y+i+4, XMM1); + } + for (; i<(n); i++) { + y[i] = x[i] / c; + } +} + +static void THFloatVector_cvtFromInt_SSE(float *y, const int *x, const ptrdiff_t n) { + ptrdiff_t i; + __m128i YMM0, YMM1; + __m128 YMM2, YMM3; + for (i=0; i<=((n)-8); i+=8) { + YMM0 = _mm_loadu_si128((__m128i const*)(x+i)); + YMM1 = _mm_loadu_si128((__m128i const*)(x+i+4)); + YMM2 = _mm_cvtepi32_ps(YMM0); + YMM3 = _mm_cvtepi32_ps(YMM1); + _mm_storeu_ps(y+i, YMM2); + _mm_storeu_ps(y+i+4, YMM3); + } + for (; i<(n); i++) { + y[i] = (float)x[i]; + } +} + +static void THDoubleVector_cvtFromInt_SSE(double *y, const int *x, const ptrdiff_t n) { + ptrdiff_t i; + __m128i YMM0, YMM1; + __m128d YMM2, YMM3; + for (i=0; i<=((n)- 4); i+=4) { + YMM0 = _mm_loadu_si128((__m128i const*)(x+i)); + YMM2 = _mm_cvtepi32_pd(YMM0); + YMM1 = _mm_srli_si128(YMM0, 8); + YMM3 = _mm_cvtepi32_pd(YMM1); + _mm_storeu_pd(y+i, YMM2); + _mm_storeu_pd(y+i+2, YMM3); + } + for (; i<(n); i++) { + y[i] = (double)x[i]; + } +} + diff --git a/aten/src/TH/vector/VSX.cpp b/aten/src/TH/vector/VSX.cpp new file mode 100644 index 0000000..f01718c --- /dev/null +++ b/aten/src/TH/vector/VSX.cpp @@ -0,0 +1,2520 @@ +#ifdef __PPC64__ +#include +#include + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_fill_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double fp64vec2 = vec_xl(0, val); + + for (i = 0; i <= n-128; i += 128) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + vec_xst(fp64vec2, 0, x+(i+16 )); + vec_xst(fp64vec2, 0, x+(i+18 )); + vec_xst(fp64vec2, 0, x+(i+20 )); + vec_xst(fp64vec2, 0, x+(i+22 )); + vec_xst(fp64vec2, 0, x+(i+24 )); + vec_xst(fp64vec2, 0, x+(i+26 )); + vec_xst(fp64vec2, 0, x+(i+28 )); + vec_xst(fp64vec2, 0, x+(i+30 )); + vec_xst(fp64vec2, 0, x+(i+32 )); + vec_xst(fp64vec2, 0, x+(i+34 )); + vec_xst(fp64vec2, 0, x+(i+36 )); + vec_xst(fp64vec2, 0, x+(i+38 )); + vec_xst(fp64vec2, 0, x+(i+40 )); + vec_xst(fp64vec2, 0, x+(i+42 )); + vec_xst(fp64vec2, 0, x+(i+44 )); + vec_xst(fp64vec2, 0, x+(i+46 )); + vec_xst(fp64vec2, 0, x+(i+48 )); + vec_xst(fp64vec2, 0, x+(i+50 )); + vec_xst(fp64vec2, 0, x+(i+52 )); + vec_xst(fp64vec2, 0, x+(i+54 )); + vec_xst(fp64vec2, 0, x+(i+56 )); + vec_xst(fp64vec2, 0, x+(i+58 )); + vec_xst(fp64vec2, 0, x+(i+60 )); + vec_xst(fp64vec2, 0, x+(i+62 )); + vec_xst(fp64vec2, 0, x+(i+64 )); + 
vec_xst(fp64vec2, 0, x+(i+66 )); + vec_xst(fp64vec2, 0, x+(i+68 )); + vec_xst(fp64vec2, 0, x+(i+70 )); + vec_xst(fp64vec2, 0, x+(i+72 )); + vec_xst(fp64vec2, 0, x+(i+74 )); + vec_xst(fp64vec2, 0, x+(i+76 )); + vec_xst(fp64vec2, 0, x+(i+78 )); + vec_xst(fp64vec2, 0, x+(i+80 )); + vec_xst(fp64vec2, 0, x+(i+82 )); + vec_xst(fp64vec2, 0, x+(i+84 )); + vec_xst(fp64vec2, 0, x+(i+86 )); + vec_xst(fp64vec2, 0, x+(i+88 )); + vec_xst(fp64vec2, 0, x+(i+90 )); + vec_xst(fp64vec2, 0, x+(i+92 )); + vec_xst(fp64vec2, 0, x+(i+94 )); + vec_xst(fp64vec2, 0, x+(i+96 )); + vec_xst(fp64vec2, 0, x+(i+98 )); + vec_xst(fp64vec2, 0, x+(i+100)); + vec_xst(fp64vec2, 0, x+(i+102)); + vec_xst(fp64vec2, 0, x+(i+104)); + vec_xst(fp64vec2, 0, x+(i+106)); + vec_xst(fp64vec2, 0, x+(i+108)); + vec_xst(fp64vec2, 0, x+(i+110)); + vec_xst(fp64vec2, 0, x+(i+112)); + vec_xst(fp64vec2, 0, x+(i+114)); + vec_xst(fp64vec2, 0, x+(i+116)); + vec_xst(fp64vec2, 0, x+(i+118)); + vec_xst(fp64vec2, 0, x+(i+120)); + vec_xst(fp64vec2, 0, x+(i+122)); + vec_xst(fp64vec2, 0, x+(i+124)); + vec_xst(fp64vec2, 0, x+(i+126)); + } + for (; i <= n-16; i += 16) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + } + for (; i <= n-2; i += 2) + vec_xst(fp64vec2, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cadds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cadd_VSX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); + y4_fp64vec2 = vec_madd(y4_fp64vec2, 
c_fp64vec2, x4_fp64vec2); + y5_fp64vec2 = vec_madd(y5_fp64vec2, c_fp64vec2, x5_fp64vec2); + y6_fp64vec2 = vec_madd(y6_fp64vec2, c_fp64vec2, x6_fp64vec2); + y7_fp64vec2 = vec_madd(y7_fp64vec2, c_fp64vec2, x7_fp64vec2); + y8_fp64vec2 = vec_madd(y8_fp64vec2, c_fp64vec2, x8_fp64vec2); + y9_fp64vec2 = vec_madd(y9_fp64vec2, c_fp64vec2, x9_fp64vec2); + y10_fp64vec2 = vec_madd(y10_fp64vec2, c_fp64vec2,x10_fp64vec2); + y11_fp64vec2 = vec_madd(y11_fp64vec2, c_fp64vec2,x11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] + c* y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_adds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_adds_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_add(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_add(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_add(x6_fp64vec2, 
c_fp64vec2); + y7_fp64vec2 = vec_add(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_add(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_add(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_add(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_add(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = x[i] +c; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cmul_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cmul_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2); + y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2); + y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2); + y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2); + y8_fp64vec2 = 
vec_mul(y8_fp64vec2, x8_fp64vec2); + y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2); + y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2); + y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] * y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_muls_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_mul(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_mul(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_mul(x6_fp64vec2, c_fp64vec2); + y7_fp64vec2 = vec_mul(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_mul(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_mul(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_mul(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_mul(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + 
vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = c * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_cdiv_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_cdiv_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); + y4_fp64vec2 = vec_div(x4_fp64vec2, y4_fp64vec2); + y5_fp64vec2 = vec_div(x5_fp64vec2, y5_fp64vec2); + y6_fp64vec2 = vec_div(x6_fp64vec2, y6_fp64vec2); + y7_fp64vec2 = vec_div(x7_fp64vec2, y7_fp64vec2); + y8_fp64vec2 = vec_div(x8_fp64vec2, y8_fp64vec2); + y9_fp64vec2 = vec_div(x9_fp64vec2, y9_fp64vec2); + y10_fp64vec2 = vec_div(x10_fp64vec2, y10_fp64vec2); + y11_fp64vec2 = vec_div(x11_fp64vec2, y11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + 
vec_xst(y4_fp64vec2, 0, z+(i+8 )); + vec_xst(y5_fp64vec2, 0, z+(i+10)); + vec_xst(y6_fp64vec2, 0, z+(i+12)); + vec_xst(y7_fp64vec2, 0, z+(i+14)); + vec_xst(y8_fp64vec2, 0, z+(i+16)); + vec_xst(y9_fp64vec2, 0, z+(i+18)); + vec_xst(y10_fp64vec2, 0, z+(i+20)); + vec_xst(y11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, z+(i )); + vec_xst(y1_fp64vec2, 0, z+(i+2 )); + vec_xst(y2_fp64vec2, 0, z+(i+4 )); + vec_xst(y3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); + vec_xst(y0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] / y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_divs_VSX: +//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_divs_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2); + y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2); + + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + 
vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = x[i] / c; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_fill_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + float val[4] = {c, c, c, c}; + vector float fp32vec4 = vec_xl(0, val); + + for (i = 0; i <= n-256; i += 256) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + vec_xst(fp32vec4, 0, x+(i+32 )); + vec_xst(fp32vec4, 0, x+(i+36 )); + vec_xst(fp32vec4, 0, x+(i+40 )); + vec_xst(fp32vec4, 0, x+(i+44 )); + vec_xst(fp32vec4, 0, x+(i+48 )); + vec_xst(fp32vec4, 0, x+(i+52 )); + vec_xst(fp32vec4, 0, x+(i+56 )); + vec_xst(fp32vec4, 0, x+(i+60 )); + vec_xst(fp32vec4, 0, x+(i+64 )); + vec_xst(fp32vec4, 0, x+(i+68 )); + vec_xst(fp32vec4, 0, x+(i+72 )); + vec_xst(fp32vec4, 0, x+(i+76 )); + vec_xst(fp32vec4, 0, x+(i+80 )); + vec_xst(fp32vec4, 0, x+(i+84 )); + vec_xst(fp32vec4, 0, x+(i+88 )); + vec_xst(fp32vec4, 0, x+(i+92 )); + vec_xst(fp32vec4, 0, x+(i+96 )); + vec_xst(fp32vec4, 0, x+(i+100)); + vec_xst(fp32vec4, 0, x+(i+104)); + vec_xst(fp32vec4, 0, x+(i+108)); + vec_xst(fp32vec4, 0, x+(i+112)); + vec_xst(fp32vec4, 0, x+(i+116)); + vec_xst(fp32vec4, 0, x+(i+120)); + vec_xst(fp32vec4, 0, x+(i+124)); + vec_xst(fp32vec4, 0, x+(i+128)); + vec_xst(fp32vec4, 0, x+(i+132)); + vec_xst(fp32vec4, 0, x+(i+136)); + vec_xst(fp32vec4, 0, x+(i+140)); + vec_xst(fp32vec4, 0, x+(i+144)); + vec_xst(fp32vec4, 0, x+(i+148)); + vec_xst(fp32vec4, 0, x+(i+152)); + vec_xst(fp32vec4, 0, x+(i+156)); + vec_xst(fp32vec4, 0, x+(i+160)); + vec_xst(fp32vec4, 0, x+(i+164)); + vec_xst(fp32vec4, 0, x+(i+168)); + vec_xst(fp32vec4, 0, x+(i+172)); + vec_xst(fp32vec4, 0, x+(i+176)); + vec_xst(fp32vec4, 0, x+(i+180)); + vec_xst(fp32vec4, 0, x+(i+184)); + vec_xst(fp32vec4, 0, x+(i+188)); + vec_xst(fp32vec4, 0, x+(i+192)); + vec_xst(fp32vec4, 0, x+(i+196)); + vec_xst(fp32vec4, 0, x+(i+200)); + vec_xst(fp32vec4, 0, x+(i+204)); + vec_xst(fp32vec4, 0, x+(i+208)); + vec_xst(fp32vec4, 0, x+(i+212)); + vec_xst(fp32vec4, 0, x+(i+216)); + vec_xst(fp32vec4, 0, x+(i+220)); + vec_xst(fp32vec4, 0, x+(i+224)); + vec_xst(fp32vec4, 0, x+(i+228)); + vec_xst(fp32vec4, 0, x+(i+232)); + vec_xst(fp32vec4, 0, x+(i+236)); + vec_xst(fp32vec4, 0, x+(i+240)); + vec_xst(fp32vec4, 0, x+(i+244)); + vec_xst(fp32vec4, 
0, x+(i+248)); + vec_xst(fp32vec4, 0, x+(i+252)); + } + for (; i <= n-32; i += 32) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + } + for (; i <= n-4; i += 4) + vec_xst(fp32vec4, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cadd_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16 )); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4); + y4_fp32vec4 = vec_madd(y4_fp32vec4, c_fp32vec4, x4_fp32vec4); + y5_fp32vec4 = vec_madd(y5_fp32vec4, c_fp32vec4, x5_fp32vec4); + y6_fp32vec4 = vec_madd(y6_fp32vec4, c_fp32vec4, x6_fp32vec4); + y7_fp32vec4 = vec_madd(y7_fp32vec4, c_fp32vec4, x7_fp32vec4); + y8_fp32vec4 = vec_madd(y8_fp32vec4, c_fp32vec4, x8_fp32vec4); + y9_fp32vec4 = vec_madd(y9_fp32vec4, c_fp32vec4, x9_fp32vec4); + y10_fp32vec4 = vec_madd(y10_fp32vec4, c_fp32vec4, x10_fp32vec4); + y11_fp32vec4 = vec_madd(y11_fp32vec4, c_fp32vec4, x11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + 
y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4); + vec_xst(y0_fp32vec4, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] + c* y[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_adds_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_adds_VSX(float *y, const float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4); + y4_fp32vec4 = vec_add(x4_fp32vec4, c_fp32vec4); + y5_fp32vec4 = vec_add(x5_fp32vec4, c_fp32vec4); + y6_fp32vec4 = vec_add(x6_fp32vec4, c_fp32vec4); + y7_fp32vec4 = vec_add(x7_fp32vec4, c_fp32vec4); + y8_fp32vec4 = vec_add(x8_fp32vec4, c_fp32vec4); + y9_fp32vec4 = vec_add(x9_fp32vec4, c_fp32vec4); + y10_fp32vec4 = vec_add(x10_fp32vec4, c_fp32vec4); + y11_fp32vec4 = vec_add(x11_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_add(x2_fp32vec4, 
c_fp32vec4); + y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i )); + } + for (; i < n; i++) + y[i] = c + x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cmul_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cmul_VSX(float *z, const float *y, const float *x, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + y4_fp32vec4 = vec_xl(0, y+(i+16 )); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4); + y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4); + y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4); + y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4); + y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4); + y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4); + y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4); + y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = 
vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + vec_xst(y0_fp32vec4, 0, z+(i )); + } + for (; i < n; i++) + z[i] = y[i] * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_muls_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + float val[4] = {c, c, c, c}; + vector float c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4); + y4_fp32vec4 = vec_mul(x4_fp32vec4, c_fp32vec4); + y5_fp32vec4 = vec_mul(x5_fp32vec4, c_fp32vec4); + y6_fp32vec4 = vec_mul(x6_fp32vec4, c_fp32vec4); + y7_fp32vec4 = vec_mul(x7_fp32vec4, c_fp32vec4); + y8_fp32vec4 = vec_mul(x8_fp32vec4, c_fp32vec4); + y9_fp32vec4 = vec_mul(x9_fp32vec4, c_fp32vec4); + y10_fp32vec4 = vec_mul(x10_fp32vec4, c_fp32vec4); + y11_fp32vec4 = vec_mul(x11_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i 
)); + } + for (; i < n; i++) + y[i] = c * x[i]; +} + + +//-------------------------------------------------------------------------------------------------- +// THFloatVector_cdiv_VSX: +//-------------------------------------------------------------------------------------------------- +static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4)); + y2_fp32vec4 = vec_xl(0, y+(i+8)); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + x4_fp32vec4 = vec_xl(0, x+(i+16 )); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4); + y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4); + y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4); + y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4); + y4_fp32vec4 = vec_div(x4_fp32vec4, y4_fp32vec4); + y5_fp32vec4 = vec_div(x5_fp32vec4, y5_fp32vec4); + y6_fp32vec4 = vec_div(x6_fp32vec4, y6_fp32vec4); + y7_fp32vec4 = vec_div(x7_fp32vec4, y7_fp32vec4); + y8_fp32vec4 = vec_div(x8_fp32vec4, y8_fp32vec4); + y9_fp32vec4 = vec_div(x9_fp32vec4, y9_fp32vec4); + y10_fp32vec4 = vec_div(x10_fp32vec4, y10_fp32vec4); + y11_fp32vec4 = vec_div(x11_fp32vec4, y11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + vec_xst(y4_fp32vec4, 0, z+(i+16 )); + vec_xst(y5_fp32vec4, 0, z+(i+20)); + vec_xst(y6_fp32vec4, 0, z+(i+24)); + vec_xst(y7_fp32vec4, 0, z+(i+28)); + vec_xst(y8_fp32vec4, 0, z+(i+32)); + vec_xst(y9_fp32vec4, 0, z+(i+36)); + vec_xst(y10_fp32vec4, 0, z+(i+40)); + vec_xst(y11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12 )); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12 )); + + y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4); + y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4); + y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4); + y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, z+(i )); + vec_xst(y1_fp32vec4, 0, z+(i+4 )); + vec_xst(y2_fp32vec4, 0, z+(i+8 )); + vec_xst(y3_fp32vec4, 0, z+(i+12 )); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = 
vec_xl(0, x+(i ));
+        y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
+        vec_xst(y0_fp32vec4, 0, z+(i ));
+    }
+    for (; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_divs_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_divs_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    ptrdiff_t i;
+
+    float val[4] = {c, c, c, c};
+    vector float c_fp32vec4 = vec_xl(0, val);
+
+    vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+    vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+    vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+    vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+
+    for (i = 0; i <= n-48; i += 48)
+    {
+        x0_fp32vec4  = vec_xl(0, x+(i   ));
+        x1_fp32vec4  = vec_xl(0, x+(i+4 ));
+        x2_fp32vec4  = vec_xl(0, x+(i+8 ));
+        x3_fp32vec4  = vec_xl(0, x+(i+12));
+        x4_fp32vec4  = vec_xl(0, x+(i+16));
+        x5_fp32vec4  = vec_xl(0, x+(i+20));
+        x6_fp32vec4  = vec_xl(0, x+(i+24));
+        x7_fp32vec4  = vec_xl(0, x+(i+28));
+        x8_fp32vec4  = vec_xl(0, x+(i+32));
+        x9_fp32vec4  = vec_xl(0, x+(i+36));
+        x10_fp32vec4 = vec_xl(0, x+(i+40));
+        x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+        y0_fp32vec4  = vec_div(x0_fp32vec4,  c_fp32vec4);
+        y1_fp32vec4  = vec_div(x1_fp32vec4,  c_fp32vec4);
+        y2_fp32vec4  = vec_div(x2_fp32vec4,  c_fp32vec4);
+        y3_fp32vec4  = vec_div(x3_fp32vec4,  c_fp32vec4);
+        y4_fp32vec4  = vec_div(x4_fp32vec4,  c_fp32vec4);
+        y5_fp32vec4  = vec_div(x5_fp32vec4,  c_fp32vec4);
+        y6_fp32vec4  = vec_div(x6_fp32vec4,  c_fp32vec4);
+        y7_fp32vec4  = vec_div(x7_fp32vec4,  c_fp32vec4);
+        y8_fp32vec4  = vec_div(x8_fp32vec4,  c_fp32vec4);
+        y9_fp32vec4  = vec_div(x9_fp32vec4,  c_fp32vec4);
+        y10_fp32vec4 = vec_div(x10_fp32vec4, c_fp32vec4);
+        y11_fp32vec4 = vec_div(x11_fp32vec4, c_fp32vec4);
+
+
+        vec_xst(y0_fp32vec4,  0, y+(i   ));
+        vec_xst(y1_fp32vec4,  0, y+(i+4 ));
+        vec_xst(y2_fp32vec4,  0, y+(i+8 ));
+        vec_xst(y3_fp32vec4,  0, y+(i+12));
+        vec_xst(y4_fp32vec4,  0, y+(i+16));
+        vec_xst(y5_fp32vec4,  0, y+(i+20));
+        vec_xst(y6_fp32vec4,  0, y+(i+24));
+        vec_xst(y7_fp32vec4,  0, y+(i+28));
+        vec_xst(y8_fp32vec4,  0, y+(i+32));
+        vec_xst(y9_fp32vec4,  0, y+(i+36));
+        vec_xst(y10_fp32vec4, 0, y+(i+40));
+        vec_xst(y11_fp32vec4, 0, y+(i+44));
+    }
+    for (; i <= n-16; i += 16)
+    {
+        x0_fp32vec4 = vec_xl(0, x+(i   ));
+        x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+        x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+        x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+        y0_fp32vec4 = vec_div(x0_fp32vec4, c_fp32vec4);
+        y1_fp32vec4 = vec_div(x1_fp32vec4, c_fp32vec4);
+        y2_fp32vec4 = vec_div(x2_fp32vec4, c_fp32vec4);
+        y3_fp32vec4 = vec_div(x3_fp32vec4, c_fp32vec4);
+
+        vec_xst(y0_fp32vec4, 0, y+(i   ));
+        vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+        vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+        vec_xst(y3_fp32vec4, 0, y+(i+12));
+    }
+    for (; i <= n-4; i += 4)
+    {
+        x0_fp32vec4 = vec_xl(0, x+(i ));
+        y0_fp32vec4 = vec_div(x0_fp32vec4, c_fp32vec4);
+        vec_xst(y0_fp32vec4, 0, y+(i ));
+    }
+    for (; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+
+//------------------------------------------------
+//
+// Testing for correctness and performance
+//
+// If you want to run these tests, compile this
+// file with -DRUN_VSX_TESTS on a Power machine,
+// and then run the executable that is generated.
+//
+//------------------------------------------------
+//
+// Example passing run (from a Power8 machine):
+//
+// $ gcc VSX.cpp -O2 -D RUN_VSX_TESTS -o vsxtest
+// $ ./vsxtest
+//
+// TODO
+//
+//
+// Finished running all tests. All tests PASSED.
+//
+//------------------------------------------------
+#ifdef RUN_VSX_TESTS
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <math.h>
+
+#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
+#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
+
+
+//--------------------------------------------------------------------------------------------------
+// Standard implementations:
+//--------------------------------------------------------------------------------------------------
+static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        x[i] = c;
+}
+
+static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        x[i] = c;
+}
+
+static void standardDouble_cadd(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] + c * y[i];
+}
+
+static void standardFloat_cadd(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] + c * y[i];
+}
+
+static void standardDouble_adds(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c + x[i];
+}
+
+static void standardFloat_adds(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c + x[i];
+}
+
+static void standardDouble_cmul(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] * y[i];
+}
+
+static void standardFloat_cmul(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] * y[i];
+}
+
+static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c * x[i];
+}
+
+static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = c * x[i];
+}
+
+static void standardDouble_cdiv(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+static void standardFloat_cdiv(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        z[i] = x[i] / y[i];
+}
+
+static void standardDouble_divs(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+static void standardFloat_divs(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+    for (ptrdiff_t i = 0; i < n; i++)
+        y[i] = x[i] / c;
+}
+
+double randDouble()
+{
+    return (double)(rand()%100)/(double)(rand()%100) * (rand()%2 ? -1.0 : 1.0);
+}
+
+int near(double a, double b)
+{
+    int aClass = fpclassify(a);
+    int bClass = fpclassify(b);
+
+    if(aClass != bClass)        // i.e. is it NAN, infinite, or finite...?
+        return 0;
+
+    if(aClass == FP_INFINITE)   // if it is infinite, the sign must be the same, i.e.
positive infinity is not near negative infinity + return (signbit(a) == signbit(b)); + else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers + return fabs(a - b) < 0.001; + else // if both number are of the same class as each other and are of any other class (i.e. such as NAN), then they are near to each other. + return 1; +} + + +//-------------------------------------------------------------------------------------------------- +// Standard tests: +//-------------------------------------------------------------------------------------------------- +void test_THDoubleVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + double yVal0 = 17.2; + double yVal1 = 8.2; + double yVal2 = 5.1; + double yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THDoubleVector_fill_VSX() 
test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + + +void test_THFloatVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + float yVal0 = 17.2; + float yVal1 = 8.2; + float yVal2 = 5.1; + float yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + + +void test_THDoubleVector_cadd_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < 
VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cadd_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cadd_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + 
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cadd_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_adds_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + x[i] = randDouble(); + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_adds() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = 
(double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_adds_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THFloatVector_adds_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + x[i] = (float)randDouble(); + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_adds() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + 
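// The shifted destination offsets (+1, +2, +3, +517, +517+r) and the shrinking
+    // lengths appear intended to verify that the VSX path matches the scalar
+    // reference for unaligned start addresses and arbitrary tail remainders.
+ 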
THFloatVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_adds_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THDoubleVector_cmul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f 
%f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cmul_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cmul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cmul_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_muls_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double 
*)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_muls_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + +void test_THFloatVector_muls_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + 
end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_muls_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + + +void test_THDoubleVector_cdiv_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / 
CLOCKS_PER_SEC; + printf("THDoubleVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_cdiv_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THFloatVector_cdiv_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_cdiv( 
z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_cdiv_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(x); +} + +void test_THDoubleVector_divs_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_divs() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, 
y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_divs_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + +void test_THFloatVector_divs_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_divs() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_divs_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +//-------------------------------------------------------------------------------------------------- +// Run tests: +//-------------------------------------------------------------------------------------------------- +int main() +{ + printf("\n"); + + + // First test utility functions + + assert(!near(0.1, -0.1)); + assert(!near(0.1f, -0.1f)); + assert(!near(9, 10)); + assert(near(0.1, 0.1000001)); + assert(near(0.1f, 
0.1000001f)); + assert(near(100.764, 100.764)); + assert(!near(NAN, 0.0)); + assert(!near(-9.5, NAN)); + assert(!near(NAN, 100)); + assert(!near(-0.0, NAN)); + assert(near(NAN, NAN)); + assert(near(INFINITY, INFINITY)); + assert(near(-INFINITY, -INFINITY)); + assert(!near(INFINITY, NAN)); + assert(!near(0, INFINITY)); + assert(!near(-999.4324, INFINITY)); + assert(!near(INFINITY, 982374.1)); + assert(!near(-INFINITY, INFINITY)); + + + + // Then test each vectorized function + + test_THDoubleVector_fill_VSX(); + test_THFloatVector_fill_VSX(); + + test_THDoubleVector_cadd_VSX(); + test_THFloatVector_cadd_VSX(); + + test_THDoubleVector_adds_VSX(); + test_THFloatVector_adds_VSX(); + + test_THDoubleVector_cmul_VSX(); + test_THFloatVector_cmul_VSX(); + + test_THDoubleVector_muls_VSX(); + test_THFloatVector_muls_VSX(); + + test_THDoubleVector_cdiv_VSX(); + test_THFloatVector_cdiv_VSX(); + + test_THDoubleVector_divs_VSX(); + test_THFloatVector_divs_VSX(); + + + + printf("Finished running all tests. All tests PASSED.\n"); + return 0; +} + + +#endif // defined RUN_VSX_TESTS + +#endif // defined __PPC64__ + diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt new file mode 100644 index 0000000..ac445f7 --- /dev/null +++ b/aten/src/THC/CMakeLists.txt @@ -0,0 +1,165 @@ +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" +PARENT_SCOPE) + +CONFIGURE_FILE(THCGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h") + +set(extra_src) +# loop over all types +foreach(THC_TYPE Byte Char Short Int Long Half Float Double) + # loop over files which need to be split between types (because of long compile times) + foreach(THC_FILE TensorSort TensorMathCompareT TensorMathPointwise TensorMathCompare TensorMathReduce TensorMasked) + if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu") + FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu" + "#include \"../THC${THC_FILE}.cuh\"\n#include \"THCTensor.hpp\"\n#include \"../generic/THC${THC_FILE}.cu\"\n#include \"../THCGenerate${THC_TYPE}Type.h\"\n") + endif() + LIST(APPEND extra_src "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu") + endforeach() +endforeach() + +IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/THCHalf.cu) +ENDIF() + +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingHostAllocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCGeneral.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCStream.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THCThreadLocal.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/THCReduceApplyUtils.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCBlas.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCSleep.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorage.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathBlas.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathMagma.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathPairwise.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathReduce.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathScan.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu + 
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu + ${extra_src} + PARENT_SCOPE) + +INSTALL(FILES + THC.h + ${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h + THCGeneral.hpp + THCBlas.h + THCSleep.h + THCStorage.h + THCStorageCopy.h + THCStream.h + THCThreadLocal.h + THCTensor.h + THCTensorCopy.h + THCTensorCopy.hpp + THCTensorRandom.h + THCTensorMath.h + THCApply.cuh + THCReduce.cuh + THCReduceAll.cuh + THCReduceApplyUtils.cuh + THCTensorMathReduce.cuh + THCAsmUtils.cuh + THCAtomics.cuh + THCScanUtils.cuh + THCSortUtils.cuh + THCAllocator.h + THCCachingAllocator.h + THCCachingHostAllocator.h + THCDeviceUtils.cuh + THCDeviceTensor.cuh + THCDeviceTensor-inl.cuh + THCDeviceTensorUtils.cuh + THCDeviceTensorUtils-inl.cuh + THCGenerateAllTypes.h + THCGenerateByteType.h + THCGenerateCharType.h + THCGenerateShortType.h + THCGenerateIntType.h + THCGenerateLongType.h + THCGenerateHalfType.h + THCGenerateFloatType.h + THCGenerateFloatTypes.h + THCGenerateDoubleType.h + THCHalf.h + THCIntegerDivider.cuh + THCNumerics.cuh + THCTensorSort.cuh + THCTensorInfo.cuh + THCTensorMathPointwise.cuh + THCTensorTypeUtils.cuh + THCTensorRandom.cuh + THCTensorMathMagma.cuh + THCThrustAllocator.cuh + THCTensorMode.cuh + THCTensorTopK.cuh + THCCachingAllocator.h + # See Note [TH abstraction violation] + THCGenerator.hpp + THCTensor.hpp + THCStorage.hpp + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC") + +INSTALL(FILES + generic/THCStorage.cpp + generic/THCStorage.cu + generic/THCStorage.h + generic/THCTensor.cpp + generic/THCTensor.cu + generic/THCTensor.h + generic/THCStorageCopy.cpp + generic/THCStorageCopy.cu + generic/THCStorageCopy.h + generic/THCTensorCopy.cpp + generic/THCTensorCopy.cu + generic/THCTensorCopy.h + generic/THCTensorMasked.h + generic/THCTensorMasked.cu + generic/THCTensorMath.h + generic/THCTensorMath.cu + generic/THCTensorMathBlas.cu + generic/THCTensorMathBlas.h + generic/THCTensorMathCompare.h + generic/THCTensorMathCompare.cu + generic/THCTensorMathCompareT.h + generic/THCTensorMathCompareT.cu + generic/THCTensorMathMagma.h + generic/THCTensorMathMagma.cu + generic/THCTensorMathPairwise.h + generic/THCTensorMathPairwise.cu + generic/THCTensorMathPointwise.h + generic/THCTensorMathPointwise.cu + generic/THCTensorMathReduce.h + generic/THCTensorMathReduce.cu + generic/THCTensorMathScan.h + generic/THCTensorMathScan.cu + generic/THCTensorScatterGather.h + generic/THCTensorScatterGather.cu + generic/THCTensorIndex.h + generic/THCTensorIndex.cu + generic/THCTensorSort.h + generic/THCTensorSort.cu + generic/THCTensorRandom.h + generic/THCTensorRandom.cu + generic/THCTensorMode.h + generic/THCTensorMode.cu + generic/THCTensorTopK.h + generic/THCTensorTopK.cu + DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/aten/src/THC/THC.h b/aten/src/THC/THC.h new file mode 100644 index 0000000..e333d8a --- /dev/null +++ b/aten/src/THC/THC.h @@ -0,0 +1,18 @@ +#ifndef THC_INC +#define THC_INC + +#include "THCGeneral.h" +#include "THCAllocator.h" +#include "THCBlas.h" +#include "THCCachingAllocator.h" +#include "THCCachingHostAllocator.h" +#include "THCSleep.h" +#include "THCStorage.h" +#include "THCStorageCopy.h" +#include "THCStream.h" +#include "THCTensor.h" +#include "THCTensorCopy.h" +#include "THCTensorRandom.h" +#include "THCTensorMath.h" + +#endif diff 
--git a/aten/src/THC/THCAllocator.cpp b/aten/src/THC/THCAllocator.cpp new file mode 100644 index 0000000..c6be2f0 --- /dev/null +++ b/aten/src/THC/THCAllocator.cpp @@ -0,0 +1,68 @@ +#include "THCAllocator.h" + +static void THCudaHostDeleter(void* ptr) { + THCudaCheck(cudaFreeHost(ptr)); +} + +struct THCudaHostAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* ptr = nullptr; + if (size != 0) { + THCudaCheck(cudaMallocHost(&ptr, size)); + } + return {ptr, ptr, &THCudaHostDeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCudaHostDeleter; + } +}; + +static THCudaHostAllocator th_cuda_host_allocator; +at::Allocator* getTHCudaHostAllocator() { + return &th_cuda_host_allocator; +} + +static void THCUVADeleter(void* ptr) { + THCudaCheck(cudaFree(ptr)); +} + +struct THCUVAAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + // See J.1.1 of the CUDA_C_Programming_Guide.pdf for UVA and coherence rules + // on various compute capabilities. + void* ptr = nullptr; + if (size != 0) { + THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)); + } + return {ptr, ptr, &THCUVADeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCUVADeleter; + } +}; + +static THCUVAAllocator thc_uva_allocator; +at::Allocator* getTHCUVAAllocator() { + return &thc_uva_allocator; +} + + +THCIpcDeleter::~THCIpcDeleter() { + int prev_device; + THCudaCheck(cudaGetDevice(&prev_device)); + THCudaCheck(cudaSetDevice(device_)); + THCudaCheck(cudaIpcCloseMemHandle(data_)); + THCudaCheck(cudaSetDevice(prev_device)); +} + +void deleteTHCIpcDeleter(void* ptr) { + delete static_cast(ptr); +} + +at::DataPtr THCIpcDeleter::makeDataPtr(void* data, int device) { + // The dynamic allocation here is a bit unfortunate + int cur_device; + THCudaCheck(cudaGetDevice(&cur_device)); + auto* context = new THCIpcDeleter(data, device); + return {data, context, &deleteTHCIpcDeleter, at::Device(at::kCUDA, cur_device)}; +} diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h new file mode 100644 index 0000000..652bb7a --- /dev/null +++ b/aten/src/THC/THCAllocator.h @@ -0,0 +1,22 @@ +#ifndef THC_ALLOCATOR_INC +#define THC_ALLOCATOR_INC + +#include "THCGeneral.h" + +THC_API THAllocator* getTHCudaHostAllocator(void); +THC_API THAllocator* getTHCUVAAllocator(void); +// IPC doesn't support (re)allocation + +#ifdef __cplusplus +class AT_API THCIpcDeleter { +public: + THCIpcDeleter(void* data, int device) : data_(data), device_(device) {}; + ~THCIpcDeleter(); + static at::DataPtr makeDataPtr(void* data, int device); +private: + void* data_; + int device_; +}; +#endif + +#endif diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh new file mode 100644 index 0000000..d456b53 --- /dev/null +++ b/aten/src/THC/THCApply.cuh @@ -0,0 +1,748 @@ +#ifndef THC_APPLY_INC +#define THC_APPLY_INC + +#include "THCTensorCopy.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCTensorCopy.hpp" + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. +// + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. 
+// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=3) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). +template +void rearrangeDims(TensorInfo* aInfo, + TensorInfo* bInfo = nullptr, + TensorInfo* cInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[3] = { aInfo->sizes, }; + IndexType *strides[3] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. + bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// Threads per block for our apply kernel +// FIXME: use occupancy calculator instead +#define THC_APPLY_THREADS_PER_BLOCK (32 * 16) +#define THC_APPLY_BLOCKS_PER_SM 4 +template +__global__ void +kernelPointwiseApply1(const OffsetInfo a, + IndexType totalElements, + Op op) { + // NOTE: The two typecasts below are essential when IndexType is 64-bit; + // without them, results are silently truncated to 32 bits! 
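+  // (blockIdx.x, blockDim.x and threadIdx.x are 32-bit unsigned values, so
+  // without the explicit cast the index arithmetic is done in 32-bit math and
+  // wraps past UINT32_MAX, even though linearIndex itself is 64 bits wide.)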
+ for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex)); + } +} + +template +__global__ void +kernelPointwiseApply2(const OffsetInfo a, + const OffsetInfo b, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex), b.get(linearIndex)); + } +} + +template +__global__ void +kernelPointwiseApply3(const OffsetInfo a, + const OffsetInfo b, + const OffsetInfo c, + IndexType totalElements, + Op op) { + for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += (IndexType) gridDim.x * blockDim.x) { + op(a.get(linearIndex), b.get(linearIndex), c.get(linearIndex)); + } +} + +inline dim3 getApplyBlock() { + return dim3(THC_APPLY_THREADS_PER_BLOCK); +} + +inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) { + if (curDevice == -1) return false; + + uint64_t numBlocks = THCCeilDiv(totalElements, static_cast(THC_APPLY_THREADS_PER_BLOCK)); + uint64_t maxGridX = THCState_getDeviceProperties(state, curDevice)->maxGridSize[0]; + if (numBlocks > maxGridX) + numBlocks = maxGridX; + + // For 32-bit indices, make sure that gridDim.x * blockDim.x fits in 32 bits. + if (totalElements <= INT32_MAX && + numBlocks > INT32_MAX / THC_APPLY_THREADS_PER_BLOCK) + numBlocks = INT32_MAX / THC_APPLY_THREADS_PER_BLOCK; + + grid = dim3(numBlocks); + return true; +} + +template +bool THC_pointwiseApply1(THCState* state, + TensorTypeA* a, + const Op& op, + TensorArgType aType = ReadWrite) { + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorTypeA* oldA = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
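+  // In the macros below, TYPE is the index type and A is the number of
+  // collapsed tensor dimensions to specialize the kernel for: 1 and 2 get
+  // dedicated instantiations, while -1 selects the generic runtime-dims case.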
+#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_A_CASE(TYPE, A) { \ + switch (A) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ +} + + // Can we use 32-bit integer math in the kernel (the linear ID for the copy + // and the resulting non-linear offset is all computable using 32-bit math?) + // We also use unsigned index math in the kernel, as signed div/mod has + // additional overhead. + if (THCTensor_canUse32BitIndexMath(state, a)) { + TensorInfo aInfo = + getTensorInfo(state, a); + rearrangeDims(&aInfo); + aInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!aInfo.isContiguous()) { + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); + } +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + rearrangeDims(&aInfo); + aInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + kernelPointwiseApply1 + <<>>( + aOffset, (uint64_t) totalElements, op); + } else { + +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + OffsetInfo + aOffset(aInfo); + kernelPointwiseApply1 + <<>>( + aOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + return true; +} + +template +bool THC_pointwiseApply2(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + if (totalElements != THCTensor_nElement(state, b)) { + return false; + } + + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, b) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. 
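+    (Indices "overlap" when more than one logical position maps to the same
+    storage element, e.g. a view expanded along a stride-0 dimension, so an
+    in-place op could otherwise touch the same element twice.)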
+ */ + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + if (bType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, b)) { + // Must perform in contiguous space + oldB = b; + b = (TensorTypeB*)THCTensor_newContiguous(state, b); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + OffsetInfo \ + (bInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b)) { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous())) + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + kernelPointwiseApply2 + <<>>( + aOffset, bOffset, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + kernelPointwiseApply2 + <<>>( + aOffset, bOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + if (oldB) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. 
+ THCTensor_copyIgnoringOverlaps(state, oldB, b); + THCTensor_free(state, b); + b = oldB; + } + + return true; +} + +template +bool THC_pointwiseApply3(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + TensorTypeC* c, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly, + TensorArgType cType = ReadOnly) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + if (totalElements != THCTensor_nElement(state, b) || + totalElements != THCTensor_nElement(state, c)) { + return false; + } + + if (THCTensor__nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, b) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, c) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, a) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + const dim3 block = getApplyBlock(); + + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + TensorTypeC* oldC = NULL; + + if (aType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, a)) { + // Must perform in contiguous space + oldA = a; + a = (TensorTypeA*)THCTensor_newContiguous(state, a); + } + if (bType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, b)) { + // Must perform in contiguous space + oldB = b; + b = (TensorTypeB*)THCTensor_newContiguous(state, b); + } + if (cType == ReadWrite && + THCTensor_maybeOverlappingIndices(state, c)) { + // Must perform in contiguous space + oldC = c; + c = (TensorTypeC*)THCTensor_newContiguous(state, c); + } + +#define HANDLE_CASE(TYPE, A, B, C) \ + kernelPointwiseApply3 \ + <<>>( \ + OffsetInfo \ + (aInfo), \ + OffsetInfo \ + (bInfo), \ + OffsetInfo \ + (cInfo), \ + (TYPE) totalElements, op); + +#define HANDLE_C_CASE(TYPE, A, B, C) { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ +} + +#define HANDLE_B_CASE(TYPE, A, B, C) { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B, C) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ +} + + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b) && + THCTensor_canUse32BitIndexMath(state, c)) { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + TensorInfo cInfo = + getTensorInfo(state, c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + +#if CUDA_VERSION < 9000 + if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); + } else { + TensorInfo aInfo = + getTensorInfo(state, a); + + TensorInfo bInfo = + getTensorInfo(state, b); + + 
TensorInfo cInfo = + getTensorInfo(state, c); + + rearrangeDims(&aInfo, &bInfo, &cInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + cInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + OffsetInfo + cOffset(cInfo); + kernelPointwiseApply3 + <<>>( + aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + } else { +#if CUDA_VERSION < 9000 + grid.x = min(THCState_getCurrentDeviceProperties(state)->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + OffsetInfo + aOffset(aInfo); + OffsetInfo + bOffset(bInfo); + OffsetInfo + cOffset(cInfo); + kernelPointwiseApply3 + <<>>( + aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + } + } +#undef HANDLE_CASE +#undef HANDLE_C_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + THCTensor_copyIgnoringOverlaps(state, oldA, a); + THCTensor_free(state, a); + a = oldA; + } + + if (oldB) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldB contiguous. + THCTensor_copyIgnoringOverlaps(state, oldB, b); + THCTensor_free(state, b); + b = oldB; + } + + if (oldC) { + // Ignore overlaps when copying back; if we use THCTensor_copy + // instead, it will recursively try and invoke ourselves to make + // oldC contiguous. + THCTensor_copyIgnoringOverlaps(state, oldC, c); + THCTensor_free(state, c); + c = oldC; + } + + return true; +} + +#undef THC_APPLY_THREADS_PER_BLOCK +#undef THC_APPLY_BLOCKS_PER_SM + +#endif // THC_APPLY_INC diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh new file mode 100644 index 0000000..c419ffa --- /dev/null +++ b/aten/src/THC/THCAsmUtils.cuh @@ -0,0 +1,142 @@ +#ifndef THC_ASM_UTILS_INC +#define THC_ASM_UTILS_INC + +// Collection of direct PTX functions + +template +struct Bitfield {}; + +template <> +struct Bitfield { + static __device__ __forceinline__ + unsigned int getBitfield(unsigned int val, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + unsigned int m = (1u << len) - 1u; + m <<= pos; + return val & m; +#else + unsigned int ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); + return ret; +#endif + } + + static __device__ __forceinline__ + unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + unsigned int m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; +#else + unsigned int ret; + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); + return ret; +#endif + } +}; + +template <> +struct Bitfield { + static __device__ __forceinline__ + uint64_t getBitfield(uint64_t val, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + uint64_t m = (1u << len) - 1u; + m <<= pos; + return val & m; +#else + uint64_t ret; + asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); + return ret; +#endif + } + + static __device__ __forceinline__ + uint64_t setBitfield(uint64_t val, 
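bfe.u32 extracts len bits starting at bit pos and right-aligns them; bfi.b32 inserts a field the same way. A portable C++ reference for what those PTX instructions compute (illustration only, not the inline-asm path used above):

```
#include <cstdio>

// Portable equivalents of a bitfield extract/insert:
// extract returns bits [pos, pos+len) of val, right-aligned;
// insert overwrites bits [pos, pos+len) of val with the low bits of ins.
unsigned int getBitfieldRef(unsigned int val, int pos, int len) {
  unsigned int mask = (len >= 32) ? 0xffffffffu : ((1u << len) - 1u);
  return (val >> pos) & mask;
}

unsigned int setBitfieldRef(unsigned int val, unsigned int ins, int pos, int len) {
  unsigned int mask = (len >= 32) ? 0xffffffffu : ((1u << len) - 1u);
  return (val & ~(mask << pos)) | ((ins & mask) << pos);
}

int main() {
  unsigned int v = 0xABCD1234u;
  std::printf("extract bits [8,16): 0x%x\n", getBitfieldRef(v, 8, 8));     // 0x12
  std::printf("insert 0xFF there:   0x%x\n", setBitfieldRef(v, 0xFF, 8, 8)); // 0xabcdff34
  return 0;
}
```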
uint64_t toInsert, int pos, int len) { +#if defined(__HIP_PLATFORM_HCC__) + pos &= 0x1f; + len &= 0x1f; + + uint64_t m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; +#else + uint64_t ret; + asm("bfi.b64 %0, %1, %2, %3, %4;" : + "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len)); + return ret; +#endif + } +}; + +__device__ __forceinline__ int getLaneId() { +#if defined(__HIP_PLATFORM_HCC__) + return hc::__lane_id(); +#else + int laneId; + asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); + return laneId; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskLt() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = (1ull << getLaneId()) - 1ull; + return m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskLe() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = (1ull << (getLaneId() + 1ull)) - 1ull; + return m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskGt() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = getLaneMaskLe(); + return m ? ~m : m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); + return mask; +#endif +} + +__device__ __forceinline__ unsigned getLaneMaskGe() { +#if defined(__HIP_PLATFORM_HCC__) + std::uint64_t m = getLaneMaskLt(); + return ~m; +#else + unsigned mask; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); + return mask; +#endif +} + + +#endif // THC_ASM_UTILS_INC diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh new file mode 100644 index 0000000..bdb7859 --- /dev/null +++ b/aten/src/THC/THCAtomics.cuh @@ -0,0 +1,148 @@ +#ifndef THC_ATOMICS_INC +#define THC_ATOMICS_INC + +#include "THC.h" +#include "THCHalf.h" +#include "THCNumerics.cuh" + +namespace at { struct Half; } + +template +struct AtomicAddIntegerImpl; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = + (uint32_t *) (address - ((size_t)address & 3)); + uint32_t old = *address_as_ui; + uint32_t shift = (((size_t)address & 3) * 8); + uint32_t sum; + uint32_t assumed; + + do { + assumed = old; + sum = val + T((old >> shift) & 0xff); + old = (old & ~(0x000000ff << shift)) | (sum << shift); + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = + (uint32_t *) ((char *)address - ((size_t)address & 2)); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + + do { + assumed = old; + sum = val + (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff); + newval = (size_t)address & 2 ? 
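All of the integer atomicAdd specializations below share one shape: load the containing word, compute the updated word, and atomicCAS it back, retrying until no other thread raced in between. A host-side analogue of that retry loop using std::atomic (a sketch of the pattern, not the device code):

```
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Emulate "atomicAdd on a type with no native RMW support" by looping on
// compare-exchange until the update lands without a race.
void casAdd(std::atomic<uint32_t>& word, uint32_t val) {
  uint32_t old = word.load();
  uint32_t desired;
  do {
    desired = old + val;                                  // compute updated word
  } while (!word.compare_exchange_weak(old, desired));    // retry if another thread won
}

int main() {
  std::atomic<uint32_t> counter{0};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&] { for (int i = 0; i < 1000; ++i) casAdd(counter, 1); });
  for (auto& w : workers) w.join();
  std::printf("counter = %u (expected 4000)\n", counter.load());
  return 0;
}
```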
(old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + uint32_t * address_as_ui = (uint32_t *) (address); + uint32_t old = *address_as_ui; + uint32_t newval; + uint32_t assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template +struct AtomicAddIntegerImpl { + inline __device__ void operator()(T *address, T val) { + unsigned long long * address_as_ui = (unsigned long long *) (address); + unsigned long long old = *address_as_ui; + unsigned long long newval; + unsigned long long assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int8_t *address, int8_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int16_t *address, int16_t val) { + AtomicAddIntegerImpl()(address, val); +} + +static inline __device__ void atomicAdd(int64_t *address, int64_t val) { + AtomicAddIntegerImpl()(address, val); +} + +#ifdef CUDA_HALF_TENSOR +static inline __device__ void atomicAdd(half *address, half val) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics::add(hsum, val); +#else + __half_raw hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + half tmpres = THCNumerics::add(hsum, val); + hsum = __half_raw(tmpres); +#endif + old = (size_t)address & 2 ? 
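For 16-bit types the update has to be folded into the aligned 32-bit word that contains the address, and (size_t)address & 2 selects which half-word is being touched. A host-side sketch of just that packing arithmetic, assuming the same little-endian layout the device code relies on:

```
#include <cstdint>
#include <cstdio>
#include <cstring>

// Replace the 16-bit lane of a 32-bit word selected by the low address bits,
// mirroring how the device code rebuilds the word before atomicCAS.
uint32_t replaceHalfword(uint32_t word, uint16_t newHalf, bool upperLane) {
  return upperLane ? (word & 0x0000ffffu) | (uint32_t(newHalf) << 16)
                   : (word & 0xffff0000u) | uint32_t(newHalf);
}

int main() {
  alignas(4) uint16_t pair[2] = {0x1111, 0x2222};  // two halves of one aligned word
  uint32_t word;
  std::memcpy(&word, pair, sizeof(word));

  // Updating pair[1]: its address has bit 1 set, so it is the upper lane
  // of the containing 32-bit word on a little-endian machine.
  bool upper = (reinterpret_cast<uintptr_t>(&pair[1]) & 2) != 0;
  word = replaceHalfword(word, 0xBEEF, upper);

  std::memcpy(pair, &word, sizeof(word));
  std::printf("pair = {0x%04x, 0x%04x}\n", (unsigned)pair[0], (unsigned)pair[1]);
  return 0;
}
```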
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); +} +static inline __device__ void atomicAdd(at::Half *address, half val) { + return atomicAdd(reinterpret_cast(address), val); +} +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) +// from CUDA C Programmic Guide +static inline __device__ void atomicAdd(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +} while (assumed != old); +} +#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__) + // This needs to be defined for the host side pass + static inline __device__ void atomicAdd(double *address, double val) { } +#endif + +#endif // THC_ATOMICS_INC diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu new file mode 100644 index 0000000..e2003da --- /dev/null +++ b/aten/src/THC/THCBlas.cu @@ -0,0 +1,539 @@ +#include "THCBlas.h" +#include "THCGeneral.h" +#include "THCHalf.h" + +float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy) +{ + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + float result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSdot(handle, i_n, x, i_incx, y, i_incy, &result)); + return result; + } + + THError("Cublas_Sdot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; +} + +double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy) +{ + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + int i_n = (int)n; + int i_incx = (int)incx; + int i_incy = (int)incy; + double result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDdot(handle, i_n, x, i_incx, y, i_incy, &result)); + return result; + } + + THError("Cublas_Ddot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; +} + +#ifdef CUDA_HALF_TENSOR +half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +{ +#if CUDA_VERSION >= 8000 + if (n == 1) { + incx = 1; + incy = 1; + } + + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { + half result; + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDotEx(handle, n, + x, CUDA_R_16F, incx, + y, CUDA_R_16F, incy, + &result, CUDA_R_16F, + CUDA_R_32F)); + return result; + } + + THError("Cublas_Hdot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return THC_float2half(0); +#else + THError("Cublas_Hdot requires CUDA 8.0+"); + return THC_float2half(0); +#endif +} +#endif + +/* Level 2 */ +void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t 
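The BLAS wrappers accept 64-bit sizes, but the classic cuBLAS entry points take int, so every call is guarded by an INT_MAX check before narrowing. The same narrowing check as a standalone sketch (narrowToInt is a hypothetical helper, not part of this patch):

```
#include <climits>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

// Narrow an int64_t BLAS dimension to int, refusing values the 32-bit
// cuBLAS interfaces cannot represent; the wrappers above do the same
// check and raise a THError instead of throwing.
int narrowToInt(int64_t v, const char* name) {
  if (v > INT_MAX) {
    throw std::runtime_error(std::string(name) + " exceeds signed int range");
  }
  return static_cast<int>(v);
}

int main() {
  std::printf("n narrows to %d\n", narrowToInt(int64_t(1) << 20, "n"));
  try {
    narrowToInt(int64_t(1) << 40, "n");
  } catch (const std::exception& e) {
    std::printf("rejected: %s\n", e.what());
  }
  return 0;
}
```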
lda, float *x, int64_t incx, float beta, float *y, int64_t incy) +{ + if(n == 1) + lda = m; + + cublasOperation_t op; + if (trans == 't') op = CUBLAS_OP_T; + else if (trans == 'n') op = CUBLAS_OP_N; + else if (trans == 'c') op = CUBLAS_OP_C; + else THError("Cublas_Sgemv parameter trans should be 't', 'n' or 'c'."); + + if( (m <= INT_MAX) && (n <= INT_MAX) && + (lda > 0) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + return; + } + THError("Cublas_Sgemv only supports m, n, lda, incx, incy" + "in the range 0 < [val] <= %d", INT_MAX); +} + +void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy) +{ + if(n == 1) + lda = m; + + cublasOperation_t op; + if (trans == 't') op = CUBLAS_OP_T; + else if (trans == 'n') op = CUBLAS_OP_N; + else if (trans == 'c') op = CUBLAS_OP_C; + else THError("Cublas_Sgemv parameter trans should be 't', 'n' or 'c'."); + + if( (m <= INT_MAX) && (n <= INT_MAX) && + (lda > 0) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + return; + } + THError("Cublas_Dgemv only supports m, n, lda, incx, incy" + "in the range 0 < [val] <= %d", INT_MAX); +} + +void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) +{ + if(n == 1) + lda = m; + + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); + return; + } + THError("Cublas_Sger only supports m, n, lda, incx, incy" + "with the bound [val] <= %d", INT_MAX); +} + +void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) +{ + if(n == 1) + lda = m; + + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = (int)incy; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); + return; + } + THError("Cublas_Dger only supports m, n, lda, incx, incy" + "with the bound [val] <= %d", INT_MAX); +} + + +cublasOperation_t convertTransToCublasOperation(char trans) { + if (trans == 't') return 
CUBLAS_OP_T; + else if (trans == 'n') return CUBLAS_OP_N; + else if (trans == 'c') return CUBLAS_OP_C; + else { + THError("trans must be one of: t, n, c"); + return CUBLAS_OP_T; + } +} + +void adjustLd(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) +{ + int transa_ = ((transa == 't') || (transa == 'T')); + int transb_ = ((transb == 't') || (transb == 'T')); + + if(n == 1) + *ldc = m; + + if(transa_) + { + if(m == 1) + *lda = k; + } + else + { + if(k == 1) + *lda = m; + } + + if(transb_) + { + if(k == 1) + *ldb = n; + } + else + { + if(n == 1) + *ldb = k; + } +} + +/* Level 3 */ +void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); + return; + } + THError("Cublas_Sgemm only supports m, n, k, lda, ldb, ldc" + "with the bound [val] <= %d", INT_MAX); +} + +#ifdef CUDA_HALF_TENSOR +// In CUDA 8.0, definition of data types for sgemmex changed +#if CUDA_VERSION < 8000 +# define CUDA_R_16F CUBLAS_DATA_HALF +#endif + +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + + // Simulated Hgemm + float fAlpha = THC_half2float(alpha); + float fBeta = THC_half2float(beta); + +#if CUDA_VERSION < 9000 + THCublasCheck(cublasSgemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); +#else + cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); + if (prop->major >= 5){ + THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + THCublasCheck(cublasGemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc, + CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); + }else{ + THCublasCheck(cublasSgemmEx(handle, opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); + } +#endif + return; + } + THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc" + "with th bound [val] <= %d", 
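adjustLd patches the leading dimensions for degenerate shapes (a single row or column in column-major layout), where callers commonly pass ld = 1 but cuBLAS still requires ld to cover the column height. A host-side sketch exercising the same rule:

```
#include <cstdint>
#include <cstdio>

// Same rule as adjustLd above: when an operand degenerates to a single
// column (column-major), its leading dimension must still be at least the
// column height, so fix it up on the caller's behalf.
void adjustLdSketch(char transa, char transb, int64_t m, int64_t n, int64_t k,
                    int64_t* lda, int64_t* ldb, int64_t* ldc) {
  bool ta = (transa == 't' || transa == 'T');
  bool tb = (transb == 't' || transb == 'T');
  if (n == 1) *ldc = m;
  if (ta) { if (m == 1) *lda = k; } else { if (k == 1) *lda = m; }
  if (tb) { if (k == 1) *ldb = n; } else { if (n == 1) *ldb = k; }
}

int main() {
  // C(m x 1) = A(m x k) * B(k x 1): callers often hand in ldb = ldc = 1.
  int64_t m = 8, n = 1, k = 4, lda = 8, ldb = 1, ldc = 1;
  adjustLdSketch('n', 'n', m, n, k, &lda, &ldb, &ldc);
  std::printf("lda=%lld ldb=%lld ldc=%lld\n",
              (long long)lda, (long long)ldb, (long long)ldc);  // 8 4 8
  return 0;
}
```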
INT_MAX); +} +#endif + +void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); + return; + } + THError("Cublas_Dgemm only supports m, n, k, lda, ldb, ldc" + "with the bound [val] <= %d", INT_MAX); +} + +#if CUDA_VERSION >= 9010 +void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, + half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + float fAlpha = THC_half2float(alpha); + float fBeta = THC_half2float(beta); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16F, (int)lda, strideA, + b, CUDA_R_16F, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16F, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); +} +#endif + +void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, + float beta, float *c[], int64_t ldc, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_SgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemmBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc, + (int)batchCount)); +} + +#if CUDA_VERSION >= 8000 +void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char 
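The strided-batched entry points describe a whole batch with one base pointer and one stride per operand: batch i reads a + i*strideA and b + i*strideB and writes c + i*strideC, with no array of device pointers. A small host-side sketch of that addressing on hypothetical 2x2 matrices:

```
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical batch of 3 GEMMs with 2x2 matrices stored back to back.
  const int batch = 3, m = 2, n = 2, k = 2;
  const long strideA = m * k, strideB = k * n, strideC = m * n;

  std::vector<float> A(batch * strideA, 1.0f);
  std::vector<float> B(batch * strideB, 2.0f);
  std::vector<float> C(batch * strideC, 0.0f);

  // This is the addressing a strided-batched GEMM performs:
  // operand i lives at base + i * stride.
  for (int i = 0; i < batch; ++i) {
    const float* a = A.data() + i * strideA;
    const float* b = B.data() + i * strideB;
    float*       c = C.data() + i * strideC;
    for (int r = 0; r < m; ++r)
      for (int col = 0; col < n; ++col) {
        float acc = 0.0f;
        for (int kk = 0; kk < k; ++kk) acc += a[r + kk * m] * b[kk + col * k];
        c[r + col * m] = acc;  // column-major, like cuBLAS
      }
  }
  std::printf("C[0](0,0) = %.1f (expected 4.0)\n", C[0]);
  return 0;
}
```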
transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, + float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgemmStridedBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, + (int)batchCount)); +} +#endif + +void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, + double beta, double *c[], int64_t ldc, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemmBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc, + (int)batchCount)); +} + +#if CUDA_VERSION >= 8000 +void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, + double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgemmStridedBatched(handle, + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, + (int)batchCount)); +} +#endif + +/* Inverse */ +void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { + if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Sgetrf only supports n, lda, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, 
THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +} + +void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { + if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrf only supports n, lda, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +} + +THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) +{ + if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + + // no need to adjust leading dimensions, since matrices are square + cublasOperation_t opa = convertTransToCublasOperation(transa); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +} + + +THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) +{ + if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + + // no need to adjust leading dimensions, since matrices are square + cublasOperation_t opa = convertTransToCublasOperation(transa); + + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +} + +void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { + + if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +} + +void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { + + if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + cublasHandle_t handle = THCState_getCurrentBlasHandle(state); + cublasSetStream(handle, THCState_getCurrentStream(state)); + THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +} diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h new file mode 100644 index 0000000..d9cff32 --- /dev/null +++ b/aten/src/THC/THCBlas.h @@ -0,0 +1,59 @@ +#ifndef THC_BLAS_INC +#define THC_BLAS_INC + +#include "THCGeneral.h" +#include "THCHalf.h" + +/* Level 1 */ +THC_API float 
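getrf/getrs/getri form the usual batched LU workflow: factor each matrix in place, then either solve right-hand sides or build the explicit inverse. As a tiny illustration of the packed layout getrf produces, here is an in-place LU on one small column-major matrix (host-side, no pivoting, purely illustrative):

```
#include <cstdio>

// In-place LU without pivoting on a small column-major matrix: after the
// loop, U occupies the upper triangle and the unit-diagonal L occupies the
// strict lower triangle, which is the packed layout the batched getrf
// routines write back (plus a pivot array).
void luInPlace(float* a, int n, int lda) {
  for (int k = 0; k < n; ++k) {
    for (int i = k + 1; i < n; ++i) {
      a[i + k * lda] /= a[k + k * lda];                       // L multiplier
      for (int j = k + 1; j < n; ++j)
        a[i + j * lda] -= a[i + k * lda] * a[k + j * lda];    // update trailing block
    }
  }
}

int main() {
  // Column-major 2x2 matrix [[4, 6], [3, 3]].
  float a[4] = {4, 3, 6, 3};
  luInPlace(a, 2, 2);
  std::printf("U = [[%.2f, %.2f], [0, %.2f]], L21 = %.2f\n",
              a[0], a[2], a[3], a[1]);   // U = [[4, 6], [0, -1.5]], L21 = 0.75
  return 0;
}
```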
THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); +THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); +#ifdef CUDA_HALF_TENSOR +THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +#endif + +/* Level 2 */ +THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); +THC_API void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy); +THC_API void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda); +THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda); + +/* Level 3 */ +THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); +THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); + +#ifdef CUDA_HALF_TENSOR +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +#endif + +THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, + float beta, float *c[], int64_t ldc, int64_t batchCount); +THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, + double beta, double *c[], int64_t ldc, int64_t batchCount); +#if CUDA_VERSION >= 8000 +THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, + float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); +THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, + double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); +#endif + +#if CUDA_VERSION >= 9010 +void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, + half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); +#endif + +/* Inverse */ +THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); +THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize); + +THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int 
ldb, int *info, int batchSize); +THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize); + +THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); +THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize); + +#endif diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp new file mode 100644 index 0000000..7d400a2 --- /dev/null +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -0,0 +1,575 @@ +#include "THCCachingAllocator.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. +// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will free all cached blocks that +// are not split and retry the allocation. +// - Large (>1MB) and small allocation requests are handled separately. Large +// allocation requests can be filled by a cudaMalloc call of the exact size. +// Small requests will allocate and split a 1MB buffer, if necessary. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. 
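A minimal sketch of the "smallest cached block that fits" lookup described in the comment above, built on a size-ordered std::set and lower_bound, which is the same structure the allocator's free lists use (streams and block splitting omitted):

```
#include <cstddef>
#include <cstdio>
#include <set>

struct FreeChunk {
  size_t size;
  void*  ptr;
};

// Order free chunks by size so lower_bound on a size-only key lands on the
// smallest chunk that can satisfy the request (best fit).
struct BySize {
  bool operator()(const FreeChunk& a, const FreeChunk& b) const {
    if (a.size != b.size) return a.size < b.size;
    return a.ptr < b.ptr;
  }
};

int main() {
  std::set<FreeChunk, BySize> freeList = {
    {512, (void*)0x1000}, {2048, (void*)0x2000}, {8192, (void*)0x3000}};

  size_t request = 1000;
  auto it = freeList.lower_bound(FreeChunk{request, nullptr});
  if (it != freeList.end()) {
    std::printf("request %zu -> reuse cached chunk of %zu bytes\n", request, it->size);
    freeList.erase(it);   // hand it out; the real allocator may also split it
  } else {
    std::printf("request %zu -> fall back to cudaMalloc\n", request);
  }
  return 0;
}
```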
+// + +namespace { + +typedef std::shared_ptr THCStreamPtr; +typedef std::set stream_set; + +const size_t kRoundSmall = 512; // round up small allocs to 512 bytes +const size_t kRoundLarge = 131072; // round up large allocs to 128 KiB +const size_t kSmallAlloc = 1048576; // largest "small" allocation is 1 MiB + +struct DeviceStats { + uint64_t amount_allocated; // total amount allocated in bytes + uint64_t max_amount_allocated; // max total amount allocated in bytes + uint64_t amount_cached; // total amount in cache in bytes + uint64_t max_amount_cached; // max total amount in cache in bytes + + DeviceStats() : + amount_allocated(0), max_amount_allocated(0), + amount_cached(0), max_amount_cached(0) { } + + void increaseAllocated(size_t delta) { + amount_allocated += delta; + max_amount_allocated = std::max(max_amount_allocated, amount_allocated); + } + + void decreaseAllocated(size_t delta) { + amount_allocated -= delta; + } + + void increaseCached(size_t delta) { + amount_cached += delta; + max_amount_cached = std::max(max_amount_cached, amount_cached); + } + + void decreaseCached(size_t delta) { + amount_cached -= delta; + } +}; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t size; // block size in bytes + char* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block(int device, cudaStream_t stream, size_t size, char* ptr=NULL) : + device(device), stream(stream), stream_uses(), size(size), ptr(ptr), + allocated(0), prev(NULL), next(NULL), event_count(0) { } +}; + +static bool BlockComparator(const Block* a, const Block* b) +{ + if (a->device != b->device) { + return a->device < b->device; + } + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +} // namespace + +struct THCCachingAllocator +{ + typedef bool (*Comparison)(const Block*, const Block*); + typedef std::set FreeBlocks; + + // device statistics + std::vector device_stats; + + // lock around all operations + std::mutex mutex; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + std::mutex cuda_free_mutex; + + // cached blocks larger than 1 MB + FreeBlocks large_blocks; + + // cached blocks 1 MB or smaller + FreeBlocks small_blocks; + + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // outstanding cuda events + std::deque> cuda_events; + + THCCachingAllocator() : + large_blocks(BlockComparator), + small_blocks(BlockComparator) {} + + DeviceStats &get_stats_for_device(int device) { + THAssert(device >= 0); + if ((size_t) device >= device_stats.size()) { + device_stats.resize(device + 1); + } + return device_stats.at(device); + } + + /** allocates a block which is safe to use from the provided stream */ + cudaError_t malloc(void** devPtr, size_t size, cudaStream_t stream) + { + std::lock_guard lock(mutex); + + int device; + cudaError_t err = cudaGetDevice(&device); + if (err != cudaSuccess) { + return err; + } + + err = process_events(); + if (err != cudaSuccess) { + return err; + } + + size = round_size(size); + bool small = size <= kSmallAlloc; + + DeviceStats &stats = get_stats_for_device(device); + + Block search_key(device, stream, 
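The rounding policy implied by these constants: requests under 512 bytes become 512 bytes, requests up to 1 MiB round up to a multiple of 512 bytes, and larger requests round up to a multiple of 128 KiB. The arithmetic, mirroring the round_size helper defined further down:

```
#include <cstddef>
#include <cstdio>

const size_t kRoundSmall = 512;      // round small allocs to 512 B
const size_t kRoundLarge = 131072;   // round large allocs to 128 KiB
const size_t kSmallAlloc = 1048576;  // "small" means at most 1 MiB

// Same rounding as the allocator's round_size helper.
size_t roundSize(size_t size) {
  if (size < kRoundSmall) return kRoundSmall;
  if (size < kSmallAlloc) return size + kRoundSmall - 1 - (size - 1) % kRoundSmall;
  return size + kRoundLarge - 1 - (size - 1) % kRoundLarge;
}

int main() {
  std::printf("%zu -> %zu\n", (size_t)100,     roundSize(100));      // 512
  std::printf("%zu -> %zu\n", (size_t)1000,    roundSize(1000));     // 1024
  std::printf("%zu -> %zu\n", (size_t)2000000, roundSize(2000000));  // 2097152
  return 0;
}
```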
size); + auto& free_blocks = small ? large_blocks : small_blocks; + + Block* block = NULL; + Block* remaining = NULL; + + auto it = free_blocks.lower_bound(&search_key); + if (it != free_blocks.end() && (*it)->device == device && (*it)->stream == stream) { + block = *it; + free_blocks.erase(it); + } else { + void* ptr; + size_t alloc_size = small ? kSmallAlloc : size; + err = cuda_malloc_retry(device, &ptr, alloc_size); + if (err != cudaSuccess) { + return err; + } + stats.increaseCached(alloc_size); + block = new Block(device, stream, alloc_size, (char*)ptr); + } + + if (block->size - size >= (small ? kRoundSmall : kSmallAlloc + 1)) { + remaining = block; + + block = new Block(device, stream, size, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr += size; + remaining->size -= size; + free_blocks.insert(remaining); + } + + block->allocated = true; + allocated_blocks[block->ptr] = block; + + *devPtr = (void*)block->ptr; + + stats.increaseAllocated(block->size); + return cudaSuccess; + } + + cudaError_t free(void* ptr) + { + std::lock_guard lock(mutex); + if (!ptr) { + return cudaSuccess; + } + + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return cudaErrorInvalidDevicePointer; + } + + Block* block = it->second; + allocated_blocks.erase(it); + block->allocated = false; + + get_stats_for_device(block->device).decreaseAllocated(block->size); + if (!block->stream_uses.empty()) { + return insert_events(block); + } + + free_block(block); + return cudaSuccess; + } + + /** returns cached blocks to the system allocator */ + cudaError_t emptyCache() + { + std::lock_guard lock(mutex); + cudaError_t err = free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); + if (err != cudaSuccess) { + return err; + } + err = free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); + if (err != cudaSuccess) { + return err; + } + return cudaSuccess; + } + + void* getBaseAllocation(void* ptr, size_t* outSize) + { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + THError("invalid device pointer: %p", ptr); + } + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + // Accumulates sizes of all memory blocks for given device in given free list + void cacheInfoAux(FreeBlocks& blocks, int dev_id, size_t* total, size_t* largest) + { + Block search_key(dev_id, 0, 0); + auto it = blocks.lower_bound(&search_key); + for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) { + size_t blocksize = (*it)->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + + void cacheInfo(int dev_id, size_t* total, size_t* largest) + { + std::lock_guard lock(mutex); + cacheInfoAux(large_blocks, dev_id, total, largest); + cacheInfoAux(small_blocks, dev_id, total, largest); + } + + void recordStream(void* ptr, THCStream* stream) + { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + THError("invalid device pointer: %p", ptr); + } + if (THCStream_stream(stream) == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + THCStream_retain(stream); + 
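When a cached block is larger than the request by more than the rounding granularity, it is split: the front piece is handed out and the remainder stays on the free list, with prev/next links preserved so a later free can merge the pieces again. A condensed sketch of that split (plain pointers standing in for the device allocation):

```
#include <cstddef>
#include <cstdio>

// Minimal stand-in for the allocator's Block: a doubly linked chain of
// pieces carved out of one larger segment.
struct Chunk {
  char*  ptr;
  size_t size;
  Chunk* prev;
  Chunk* next;
  bool   allocated;
};

// Carve `want` bytes off the front of `block`; the remainder stays linked
// so a later free can coalesce the pieces back together.
Chunk* split(Chunk* block, size_t want) {
  Chunk* head = new Chunk{block->ptr, want, block->prev, block, true};
  block->ptr  += want;
  block->size -= want;
  block->prev  = head;
  if (head->prev) head->prev->next = head;
  return head;
}

int main() {
  static char segment[1 << 20];   // pretend this came from cudaMalloc
  Chunk* cached = new Chunk{segment, sizeof(segment), nullptr, nullptr, false};
  Chunk* handedOut = split(cached, 4096);
  std::printf("returned %zu bytes, %zu bytes stay cached\n",
              handedOut->size, cached->size);
  delete handedOut;
  delete cached;
  return 0;
}
```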
block->stream_uses.insert(THCStreamPtr(stream, &THCStream_free)); + } + + /** moves a block into the free block list */ + void free_block(Block* block) + { + THAssert(!block->allocated && block->event_count == 0); + bool small = block->size <= kSmallAlloc; + auto& free_blocks = small ? large_blocks : small_blocks; + try_merge_blocks(block, block->prev, free_blocks); + try_merge_blocks(block, block->next, free_blocks); + free_blocks.insert(block); + } + + /** combine previously split blocks */ + void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks) + { + if (!src || src->allocated || src->event_count > 0) { + return; + } + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + dst->size += src->size; + free_blocks.erase(src); + delete src; + } + + size_t round_size(size_t size) + { + if (size < kRoundSmall) { + size = kRoundSmall; + } else if (size < kSmallAlloc) { + size += kRoundSmall - 1 - (size - 1) % kRoundSmall; + } else { + size += kRoundLarge - 1 - (size - 1) % kRoundLarge; + } + return size; + } + + cudaError_t cuda_malloc_retry(int device, void** devPtr, size_t size) + { + // Try cudaMalloc. If cudaMalloc fails, frees all non-split cached blocks + // and retries. + cudaError_t err = cudaMalloc(devPtr, size); + if (err != cudaSuccess) { + cudaGetLastError(); + err = free_cached_blocks(device); + if (err != cudaSuccess) { + return err; + } + err = cudaMalloc(devPtr, size); + if (err != cudaSuccess) { + return err; + } + } + return cudaSuccess; + } + + cudaError_t free_cached_blocks(int device) + { + // Free all non-split cached blocks on device + Block lower_bound(device, NULL, 0); + Block upper_bound(device + 1, NULL, 0); + + cudaError_t err = free_blocks( + large_blocks, + large_blocks.lower_bound(&lower_bound), + large_blocks.lower_bound(&upper_bound)); + if (err != cudaSuccess) { + return err; + } + err = free_blocks( + small_blocks, + small_blocks.lower_bound(&lower_bound), + small_blocks.lower_bound(&upper_bound)); + return err; + } + + cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end) + { + // Frees all non-split blocks between `it` and `end` + std::lock_guard lock(cuda_free_mutex); + while (it != end) { + Block* block = *it; + if (!block->prev && !block->next) { + cudaError_t err = cudaFree((void*)block->ptr); + if (err != cudaSuccess) { + return err; + } + get_stats_for_device(block->device).decreaseCached(block->size); + auto cur = it; + ++it; + blocks.erase(cur); + delete block; + } else { + ++it; + } + } + return cudaSuccess; + } + + Block* find_allocated_block(void *ptr) { + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return NULL; + } + return it->second; + } + + cudaError_t insert_events(Block* block) + { + cudaError_t err; + + int prev_device; + err = cudaGetDevice(&prev_device); + if (err != cudaSuccess) return err; + + std::set streams(std::move(block->stream_uses)); + THAssert(block->stream_uses.empty()); + for (auto it = streams.begin(); it != streams.end(); ++it) { + auto& stream = *it; + + err = cudaSetDevice(THCStream_device(stream.get())); + if (err != cudaSuccess) break; + + cudaEvent_t event; + err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + if (err != cudaSuccess) break; + + err = cudaEventRecord(event, THCStream_stream(stream.get())); + if (err != cudaSuccess) break; + + block->event_count++; 
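cuda_malloc_retry encodes a simple fallback: if the first cudaMalloc fails, return all cached, non-split blocks to the driver and try once more. The same retry shape, sketched with hypothetical tryAllocate/releaseCache stand-ins for the CUDA calls:

```
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-ins for cudaMalloc and free_cached_blocks.
bool tryAllocate(void** out, size_t size, bool pretendFull) {
  if (pretendFull) { *out = nullptr; return false; }
  *out = std::malloc(size);
  return *out != nullptr;
}
void releaseCache(bool* pretendFull) {
  std::printf("releasing cached blocks back to the system\n");
  *pretendFull = false;   // after releasing, the next attempt can succeed
}

// Same shape as cuda_malloc_retry: attempt, release cache on failure, retry.
bool allocWithRetry(void** out, size_t size, bool* pretendFull) {
  if (tryAllocate(out, size, *pretendFull)) return true;
  releaseCache(pretendFull);
  return tryAllocate(out, size, *pretendFull);
}

int main() {
  bool pretendFull = true;   // simulate an out-of-memory first attempt
  void* p = nullptr;
  if (allocWithRetry(&p, 1024, &pretendFull)) {
    std::printf("allocation succeeded on retry\n");
    std::free(p);
  }
  return 0;
}
```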
+ cuda_events.emplace_back(event, block); + } + + cudaSetDevice(prev_device); + return err; + } + + cudaError_t process_events() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + break; + } else if (err != cudaSuccess) { + return err; + } + err = cudaEventDestroy(event); + if (err != cudaSuccess) { + return err; + } + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + return cudaSuccess; + } +}; + +THCCachingAllocator caching_allocator; + +static void CudaCachingDeleter(void* ptr) { + AT_CUDA_CHECK(caching_allocator.free(ptr)); +} + +// NB: I decided not to fold this into THCCachingAllocator, because the latter +// has a lot more methods and it wasn't altogether clear that they should +// actually be publically exposed +struct CudaCachingAllocator : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + int device; + THCudaCheck(cudaGetDevice(&device)); + void* r = nullptr; + if (size != 0) { + AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::globalContext().getCurrentCUDAStreamOnDevice(device))); + } + return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + return &CudaCachingDeleter; + } +}; + +CudaCachingAllocator device_allocator; + +THC_API at::Allocator* THCCachingAllocator_get(void) +{ + return &device_allocator; +} + +THC_API void THCCachingAllocator_emptyCache(void) { + AT_CUDA_CHECK(caching_allocator.emptyCache()); +} + +THC_API void THCCachingAllocator_cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + caching_allocator.cacheInfo(dev_id, cachedAndFree, largestBlock); +} + +THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size) +{ + return caching_allocator.getBaseAllocation(ptr, size); +} + +THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream) +{ + caching_allocator.recordStream(ptr, stream); +} + +THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex() +{ + return &caching_allocator.cuda_free_mutex; +} + +static inline void assertValidDevice(int device) { + int device_count; + THCudaCheck(cudaGetDeviceCount(&device_count)); + THAssertMsg(0 <= device && device < device_count, "Invalid device argument."); +} + +THC_API uint64_t THCCachingAllocator_currentMemoryAllocated(int device) +{ + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).amount_allocated; +} + +THC_API uint64_t THCCachingAllocator_maxMemoryAllocated(int device) { + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).max_amount_allocated; +} + +THC_API uint64_t THCCachingAllocator_currentMemoryCached(int device) +{ + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).amount_cached; +} + +THC_API uint64_t THCCachingAllocator_maxMemoryCached(int device) { + assertValidDevice(device); + return caching_allocator.get_stats_for_device(device).max_amount_cached; +} diff --git 
a/aten/src/THC/THCCachingAllocator.h b/aten/src/THC/THCCachingAllocator.h new file mode 100644 index 0000000..61314ac --- /dev/null +++ b/aten/src/THC/THCCachingAllocator.h @@ -0,0 +1,25 @@ +#ifndef THC_DEVICE_ALLOCATOR_INC +#define THC_DEVICE_ALLOCATOR_INC + +#if (__cplusplus >= 201103L) || (defined(_MSC_VER) && defined(__cplusplus)) +#include +#endif + +#include "THCGeneral.h" +#include "THCStream.h" + +THC_API THCDeviceAllocator* THCCachingAllocator_get(void); +THC_API void THCCachingAllocator_emptyCache(void); +THC_API void THCCachingAllocator_cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock); +THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size); +THC_API void THCCachingAllocator_recordStream(void *ptr, THCStream* stream); +THC_API uint64_t THCCachingAllocator_currentMemoryAllocated(int device); +THC_API uint64_t THCCachingAllocator_maxMemoryAllocated(int device); +THC_API uint64_t THCCachingAllocator_currentMemoryCached(int device); +THC_API uint64_t THCCachingAllocator_maxMemoryCached(int device); + +#if (__cplusplus >= 201103L) || (defined(_MSC_VER) && defined(__cplusplus)) +THC_API std::mutex* THCCachingAllocator_getCudaFreeMutex(); +#endif + +#endif diff --git a/aten/src/THC/THCCachingHostAllocator.cpp b/aten/src/THC/THCCachingHostAllocator.cpp new file mode 100644 index 0000000..617c6f2 --- /dev/null +++ b/aten/src/THC/THCCachingHostAllocator.cpp @@ -0,0 +1,282 @@ +#include "THCCachingHostAllocator.h" +#include "THCStream.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +typedef std::shared_ptr THCStreamPtr; + +struct BlockSize +{ + size_t size; // allocation size + void* ptr; // host memory pointer + + BlockSize(size_t size, void* ptr=NULL) : size(size), ptr(ptr) {} +}; + +struct Block : public BlockSize +{ + bool allocated; // true if the block is currently allocated + int event_count; // number of outstanding cuda events + std::set streams; + + Block(size_t size, void* ptr, bool allocated) : + BlockSize(size, ptr), allocated(allocated), event_count(0), streams() {} +}; + +static bool BlockComparator(const BlockSize& a, const BlockSize& b) +{ + // sort by size, break ties with pointer + if (a.size != b.size) { + return a.size < b.size; + } + return (uintptr_t)a.ptr < (uintptr_t)b.ptr; +} + +struct HostAllocator +{ + typedef bool (*Comparison)(const BlockSize&, const BlockSize&); + + // lock around all operations + std::mutex mutex; + + // blocks by pointer + std::unordered_map blocks; + + // pointers that are ready to be allocated (event_count=0) + std::set available; + + // outstanding cuda events + std::deque> cuda_events; + + HostAllocator() : available(BlockComparator) {} + + cudaError_t malloc(void** ptr, size_t size) + { + std::lock_guard lock(mutex); + + // process outstanding cuda events which may have occurred + cudaError_t err = processEvents(); + if (err != cudaSuccess) { + return err; + } + + // search for the smallest block which can hold this allocation + BlockSize search_key(size); + auto it = available.lower_bound(search_key); + if (it != available.end()) { + Block& block = blocks.at(it->ptr); + THAssert(!block.allocated && block.event_count == 0); + block.allocated = true; + *ptr = block.ptr; + available.erase(it); + return cudaSuccess; + } + + // note that cudaHostAlloc may not touch pointer if size is 0 + *ptr = 0; + + // allocate a new block if no cached allocation is found + err = cudaHostAlloc(ptr, size, cudaHostAllocDefault); + if (err != cudaSuccess) { + return 
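The host allocator keeps two views of its blocks: a hash map keyed by pointer so free() can find a block in O(1), and a size-ordered set of the currently reusable ones so malloc() can do a best-fit lower_bound. A condensed sketch of that pairing (stream/event bookkeeping omitted, std::malloc standing in for cudaHostAlloc):

```
#include <cstdio>
#include <cstdlib>
#include <set>
#include <unordered_map>

struct HostBlock {
  size_t size;
  void*  ptr;
  bool   inUse;
};

struct SizeOrder {
  bool operator()(const HostBlock* a, const HostBlock* b) const {
    if (a->size != b->size) return a->size < b->size;
    return a->ptr < b->ptr;
  }
};

std::unordered_map<void*, HostBlock*> byPtr;   // every block, keyed by pointer
std::set<HostBlock*, SizeOrder> available;     // only reusable blocks, by size

void* cachedAlloc(size_t size) {
  HostBlock key{size, nullptr, false};
  auto it = available.lower_bound(&key);
  if (it != available.end()) {                 // reuse the smallest fit
    HostBlock* b = *it;
    available.erase(it);
    b->inUse = true;
    return b->ptr;
  }
  void* p = std::malloc(size);                 // stand-in for cudaHostAlloc
  byPtr[p] = new HostBlock{size, p, true};
  return p;
}

void cachedFree(void* p) {
  HostBlock* b = byPtr.at(p);                  // O(1) lookup by pointer
  b->inUse = false;
  available.insert(b);                         // back on the reuse list
}

int main() {
  void* a = cachedAlloc(4096);
  cachedFree(a);
  void* b = cachedAlloc(1024);                 // reuses the 4096-byte block
  std::printf("reused: %s\n", (a == b) ? "yes" : "no");
  return 0;
}
```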
err; + } + + blocks.insert({*ptr, Block(size, *ptr, true)}); + return cudaSuccess; + } + + cudaError_t free(void* ptr) + { + std::lock_guard lock(mutex); + + if (!ptr) { + return cudaSuccess; + } + + // process outstanding cuda events which may have occurred + cudaError_t err = processEvents(); + if (err != cudaSuccess) { + return err; + } + + auto it = blocks.find(ptr); + THAssert(it != blocks.end()); + + Block& block = it->second; + THAssert(block.allocated); + + // free (on valid memory) shouldn't fail, so mark unallocated before + // we process the streams. + block.allocated = false; + + // insert CUDA events for each stream on which this block was used. This + err = insertEvents(block); + if (err != cudaSuccess) { + return err; + } + + if (block.event_count == 0) { + // the block can be re-used if there are no outstanding cuda events + available.insert(block); + } + return cudaSuccess; + } + + cudaError_t recordEvent(void* ptr, THCStream *stream) + { + std::lock_guard lock(mutex); + + auto it = blocks.find(ptr); + if (it == blocks.end()) { + // ignore events for untracked pointers + return cudaSuccess; + } + + Block& block = it->second; + THAssert(block.allocated); + + THCStreamPtr stream_ptr(stream, &THCStream_free); + THCStream_retain(stream); + + block.streams.insert(std::move(stream_ptr)); + return cudaSuccess; + } + + cudaError_t processEvents() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + break; + } else if (err != cudaSuccess) { + return err; + } + err = cudaEventDestroy(event); + if (err != cudaSuccess) { + return err; + } + + Block& block = blocks.at(e.second); + block.event_count--; + if (block.event_count == 0 && !block.allocated) { + available.insert(block); + } + cuda_events.pop_front(); + } + return cudaSuccess; + } + + void emptyCache() + { + std::lock_guard lock(mutex); + + // remove events for freed blocks + for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) { + cudaEvent_t event = it->first; + Block& block = blocks.at(it->second); + if (!block.allocated) { + THCudaCheckWarn(cudaEventDestroy(event)); + block.event_count--; + } + } + + // all cuda_events have been processed + cuda_events.clear(); + + // clear list of available blocks + available.clear(); + + // free and erase non-allocated blocks + for (auto it = blocks.begin(); it != blocks.end();) { + Block& block = it->second; + if (!block.allocated) { + THCudaCheckWarn(cudaFreeHost(block.ptr)); + it = blocks.erase(it); + } else { + ++it; + } + } + } + + cudaError_t insertEvents(Block& block) + { + cudaError_t err; + + int prev_device; + err = cudaGetDevice(&prev_device); + if (err != cudaSuccess) return err; + + std::set streams(std::move(block.streams)); + for (auto it = streams.begin(); it != streams.end(); ++it) { + auto& stream = *it; + + err = cudaSetDevice(THCStream_device(stream.get())); + if (err != cudaSuccess) break; + + cudaEvent_t event; + err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + if (err != cudaSuccess) break; + + err = cudaEventRecord(event, THCStream_stream(stream.get())); + if 
(err != cudaSuccess) break; + + block.event_count++; + cuda_events.emplace_back(event, block.ptr); + } + + cudaSetDevice(prev_device); + return err; + } +}; + +} // namespace + +static HostAllocator allocator; + +cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream) +{ + return allocator.recordEvent(ptr, stream); +} + +void THCCachingHostAllocator_emptyCache() +{ + allocator.emptyCache(); +} + +static void THCCachingHostDeleter(void* ptr) { + allocator.free(ptr); +} + +struct THCCachingHostAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + THAssert(size >= 0); + void *ptr; + THCudaCheck(allocator.malloc(&ptr, size)); + return {ptr, ptr, &THCCachingHostDeleter, at::kCPU}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THCCachingHostDeleter; + } +}; + +static THCCachingHostAllocator thc_caching_host_allocator; +at::Allocator* getTHCCachingHostAllocator() { + return &thc_caching_host_allocator; +} diff --git a/aten/src/THC/THCCachingHostAllocator.h b/aten/src/THC/THCCachingHostAllocator.h new file mode 100644 index 0000000..adb86cb --- /dev/null +++ b/aten/src/THC/THCCachingHostAllocator.h @@ -0,0 +1,31 @@ +#ifndef THC_CACHING_HOST_ALLOCATOR_INC +#define THC_CACHING_HOST_ALLOCATOR_INC + +#include "THCGeneral.h" +#include "THCStream.h" + +// +// A caching allocator for CUDA host allocations (pinned memory). +// +// This provides a drop-in replacement for THCudaHostAllocator, which re-uses +// freed pinned (page-locked) memory allocations. This avoids device +// synchronizations due to cudaFreeHost calls. +// +// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be +// called anytime a pointer from this allocator is used in a cudaMemcpyAsync +// call between host and device. The THC library implements this for storages +// and tensors in THCTensor_(copyAsyncCPU) and THCTensor_(copyAsyncCuda). +// +// Note that this allocator does not split larger allocations into smaller +// blocks, unlike the caching device allocator. +// +THC_API THAllocator* getTHCCachingHostAllocator(void); + +// Records an event in the specified stream. The allocation 'ptr' will not be +// re-used until the event has occurred. 
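The call pattern the comment above prescribes, written out by hand with plain CUDA runtime calls rather than through the allocator, to show what recordEvent is protecting against: a pinned buffer handed to cudaMemcpyAsync must not be recycled until an event recorded after the copy has completed (error checking omitted for brevity):

```
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t bytes = 1 << 20;
  float* pinned = nullptr;
  float* device = nullptr;
  cudaStream_t stream;
  cudaEvent_t  done;

  cudaStreamCreate(&stream);
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaHostAlloc((void**)&pinned, bytes, cudaHostAllocDefault);
  cudaMalloc((void**)&device, bytes);

  cudaMemcpyAsync(device, pinned, bytes, cudaMemcpyHostToDevice, stream);
  cudaEventRecord(done, stream);     // what recordEvent does on your behalf

  // Only once the event has completed is the pinned buffer safe to recycle.
  cudaEventSynchronize(done);
  std::printf("copy finished; pinned buffer may be reused\n");

  cudaFree(device);
  cudaFreeHost(pinned);
  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  return 0;
}
```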
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream); + +// Releases cached pinned memory allocations via cudaHostFree +THC_API void THCCachingHostAllocator_emptyCache(void); + +#endif diff --git a/aten/src/THC/THCDeviceTensor-inl.cuh b/aten/src/THC/THCDeviceTensor-inl.cuh new file mode 100644 index 0000000..16e1f94 --- /dev/null +++ b/aten/src/THC/THCDeviceTensor-inl.cuh @@ -0,0 +1,416 @@ +#include + +namespace detail { + +template +__host__ __device__ void copy(T to[N], T from[N]) { + for (int i = 0; i < N; ++i) { + to[i] = from[i]; + } +} + +} // namespace detail + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor::THCDeviceTensor() + : data_(NULL) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = 0; + stride_[i] = (IndexT) 1; + } +} + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor:: +#ifdef _MSC_VER +THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim]) +#else +THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim]) +#endif + : data_(data) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + } + + stride_[Dim - 1] = (IndexT) 1; + for (int i = Dim - 2; i >= 0; --i) { + stride_[i] = stride_[i + 1] * sizes[i + 1]; + } +} + +template class PtrTraits> +__host__ __device__ +THCDeviceTensor::THCDeviceTensor( +#ifdef _MSC_VER + DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim]) +#else + DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim]) +#endif + : data_(data) { + thc_static_assert(Dim > 0); + + for (int i = 0; i < Dim; ++i) { + size_[i] = sizes[i]; + stride_[i] = strides[i]; + } +} + +template class PtrTraits> +template +__host__ __device__ bool +THCDeviceTensor::isSameSizeAndStride( + const THCDeviceTensor& rhs) const { + if (Dim != OtherDim) { + return false; + } + + for (int i = 0; i < Dim; ++i) { + if (size_[i] != rhs.size_[i]) { + return false; + } + + if (stride_[i] != rhs.stride_[i]) { + return false; + } + } + + return true; +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::cast() { + thc_static_assert(sizeof(U) == sizeof(T)); + + return THCDeviceTensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +template +__host__ __device__ const THCDeviceTensor +THCDeviceTensor::cast() const { + thc_static_assert(sizeof(U) == sizeof(T)); + + return THCDeviceTensor( + reinterpret_cast(data_), size_, stride_); +} + +template class PtrTraits> +__host__ __device__ ptrdiff_t +THCDeviceTensor::numElements() const { + ptrdiff_t size = getSize(0); + + for (int i = 1; i < Dim; ++i) { + size *= getSize(i); + } + + return size; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isContiguous() const { + return isContiguousRange(0, Dim); +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isConsistentlySized(int i) const { + if (i == 0 && getStride(i) > 0 && getSize(i) > 0) { + return true; + } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) && + ((getStride(i - 1) / getStride(i)) >= getSize(i))) { + return true; + } + + return false; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isConsistentlySized() const { + for (int i = 0; i < Dim; ++i) { + if (!isConsistentlySized(i)) { + return false; + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ bool +THCDeviceTensor::isContiguousRange( + int first, int last) const { + + int64_t prevSize = 
last < Dim ? getStride(last) * getSize(last) : 1; + + for (int i = last - 1; i >= first; --i) { + if (getSize(i) != (IndexT) 1) { + if (getStride(i) == prevSize) { + prevSize *= getSize(i); + } else { + return false; + } + } + } + + return true; +} + +template class PtrTraits> +__host__ __device__ THCDeviceTensor +THCDeviceTensor::transpose(int dim1, + int dim2) const { +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(dim1 >= 0 && dim1 < Dim); + assert(dim1 >= 0 && dim2 < Dim); +#else + // Host code + if (dim1 < 0 || dim1 >= Dim) { + THError("dim1 out of bounds"); + } + + if (dim2 < 0 || dim2 >= Dim) { + THError("dim2 out of bounds"); + } +#endif + + IndexT newSize[Dim]; + IndexT newStride[Dim]; + + for (int i = 0; i < Dim; ++i) { + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } + + IndexT tmp = newSize[dim1]; + newSize[dim1] = newSize[dim2]; + newSize[dim2] = tmp; + + tmp = newStride[dim1]; + newStride[dim1] = newStride[dim2]; + newStride[dim2] = tmp; + + return THCDeviceTensor(data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::upcastOuter() { + // Can only create tensors of greater dimension + thc_static_assert(NewDim > Dim); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int shift = NewDim - Dim; + + for (int i = 0; i < NewDim; ++i) { + if (i < shift) { + // These are the extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = size_[0] * stride_[0]; + } else { + // Shift the remaining dimensions + newSize[i] = size_[i - shift]; + newStride[i] = stride_[i - shift]; + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::upcastInner() { + // Can only create tensors of greater dimension + thc_static_assert(NewDim > Dim); + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + for (int i = 0; i < NewDim; ++i) { + if (i < Dim) { + // Existing dimensions get copied over + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } else { + // Extended dimensions + newSize[i] = (IndexT) 1; + newStride[i] = (IndexT) 1; + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::downcastOuter() { + // Can only create tensors of lesser dimension + thc_static_assert(NewDim < Dim); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). 
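+  // For example (illustrative): collapsing a contiguous [2][3][4] tensor to
+  // NewDim = 2 yields a [6][4] view over the same storage. If dimension 0
+  // were padded (stride[0] > size[1] * stride[1]), the collapsed outer
+  // dimension would stride across the padding, so the check below rejects it.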
+ bool cont = isContiguousRange(0, Dim - NewDim); +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(cont); +#else + // Host code + if (!cont) { + THError("Can only downcast contiguous tensors"); + } +#endif + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + int ignoredDims = Dim - NewDim; + IndexT collapsedSize = 1; + + for (int i = 0; i < Dim; ++i) { + if (i < ignoredDims) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == ignoredDims) { + // This is the first non-collapsed dimension + newSize[i - ignoredDims] = collapsedSize * getSize(i); + } else { + // Subsequent non-collapsed dimensions + newSize[i - ignoredDims] = getSize(i); + } + + newStride[i - ignoredDims] = getStride(i); + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::downcastInner() { + // Can only create tensors of lesser dimension + thc_static_assert(NewDim < Dim); + + // We can't downcast non-contiguous tensors, since it leaves + // garbage data in the tensor. The tensor needs to be contiguous + // in all of the dimensions we are collapsing (no padding in + // them). + bool cont = isContiguousRange(NewDim, Dim); +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + // Device code + assert(cont); +#else + // Host code + if (!cont) { + THError("Can only downcast contiguous tensors"); + } +#endif + + IndexT newSize[NewDim]; + IndexT newStride[NewDim]; + + IndexT collapsedSize = 1; + + for (int i = Dim - 1; i >= 0; --i) { + if (i >= NewDim) { + // Collapse these dimensions + collapsedSize *= getSize(i); + } else { + // Non-collapsed dimensions + if (i == NewDim - 1) { + // This is the first non-collapsed dimension + newSize[i] = collapsedSize * getSize(i); + newStride[i] = getStride(Dim - 1); + } else { + // Subsequent non-collapsed dimensions + newSize[i] = getSize(i); + newStride[i] = getStride(i); + } + } + } + + return THCDeviceTensor( + data_, newSize, newStride); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::view(DataPtrType at) { + thc_static_assert(SubDim >= 1 && SubDim < Dim); + + IndexT viewSizes[SubDim]; + IndexT viewStrides[SubDim]; + + for (int i = 0; i < SubDim; ++i) { + viewSizes[i] = size_[Dim - SubDim + i]; + viewStrides[i] = stride_[Dim - SubDim + i]; + } + + return THCDeviceTensor( + at, viewSizes, viewStrides); +} + +template class PtrTraits> +template +__host__ __device__ THCDeviceTensor +THCDeviceTensor::view() { + return view(data_); +} + +template class PtrTraits> +void +THCDeviceTensor::zero(cudaStream_t stream) { +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) + assert(isContiguous()); +#else + if (!isContiguous()) { + THError("fillAsync only works on contiguous data"); + } +#endif + + cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream); +} diff --git a/aten/src/THC/THCDeviceTensor.cuh b/aten/src/THC/THCDeviceTensor.cuh new file mode 100644 index 0000000..2df26be --- /dev/null +++ b/aten/src/THC/THCDeviceTensor.cuh @@ -0,0 +1,513 @@ +#ifndef THC_DEVICE_TENSOR_INC +#define THC_DEVICE_TENSOR_INC + +#include +#include + +// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0. 
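+// How it works: only the <true> specialization below is defined, so
+// thc_static_assert(expr) constructs a harmless temporary when `expr` holds
+// and fails to compile (incomplete type) when it does not, e.g.
+//   thc_static_assert(sizeof(float) == 4);  // ok
+//   thc_static_assert(Dim > 0);             // compile error if Dim == 0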
+template +struct THCStaticAssert; + +template <> +struct THCStaticAssert { +}; + +#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>()) + +/// Our tensor type +template class PtrTraits> +class THCDeviceTensor; + +/// Type of a subspace of a tensor +namespace detail { +template class PtrTraits> +class THCDeviceSubTensor; +} + +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; + +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +/** + Templated multi-dimensional array that supports strided access of + elements. Main access is through `operator[]`; e.g., + `tensor[x][y][z]`. + +- `T` is the contained type (e.g., `float`) +- `Dim` is the tensor rank +- `IndexT` is the integer type used for size/stride arrays, and for +- all indexing math. Default is `int`, but for large tensors, `int64_t` +- can be used instead. +- `PtrTraits` are traits applied to our data pointer (T*). By default, +- this is just T*, but RestrictPtrTraits can be used to apply T* +- __restrict__ for alias-free analysis. +*/ +template class PtrTraits = DefaultPtrTraits> +class THCDeviceTensor { + public: + enum { NumDim = Dim }; + typedef T DataType; + typedef IndexT IndexType; + typedef typename PtrTraits::PtrType DataPtrType; + typedef THCDeviceTensor TensorType; + + /// Default constructor + __host__ __device__ THCDeviceTensor(); + + /// Constructor that calculates strides with no padding + __host__ __device__ THCDeviceTensor(DataPtrType data, +#ifdef _MSC_VER + const IndexT (&sizes)[Dim]); +#else + const IndexT sizes[Dim]); +#endif + + /// Constructor that takes arbitrary size/stride arrays + __host__ __device__ THCDeviceTensor(DataPtrType data, +#ifdef _MSC_VER + const IndexT (&sizes)[Dim], + const IndexT (&strides)[Dim]); +#else + const IndexT sizes[Dim], + const IndexT strides[Dim]); +#endif + + /// Returns true if the two tensors are of the same dimensionality, + /// size and stride. + template + __host__ __device__ bool + isSameSizeAndStride( + const THCDeviceTensor& rhs) const; + + /// Cast to a tensor of a different type of the same size and stride + template + __host__ __device__ THCDeviceTensor cast(); + + /// Const version of `cast` + template + __host__ __device__ + const THCDeviceTensor cast() const; + + /// Returns a raw pointer to the start of our data. + __host__ __device__ __forceinline__ DataPtrType data() { + return data_; + } + + /// Returns a raw pointer to the start of our data (const). + __host__ __device__ __forceinline__ + const DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + const typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Returns a read/write view of a portion of our tensor. + __host__ __device__ __forceinline__ + detail::THCDeviceSubTensor + operator[](IndexT); + + /// Returns a read/write view of a portion of our tensor (const). + __host__ __device__ __forceinline__ + const detail::THCDeviceSubTensor + operator[](IndexT) const; + + /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. + __host__ __device__ __forceinline__ int getSize(int i) const { + return size_[i]; + } + + /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds + /// checking. 
+ __host__ __device__ __forceinline__ int getStride(int i) const { + return stride_[i]; + } + + /// Returns the total number of elements contained within our data + /// (product of `getSize(i)`) + __host__ __device__ ptrdiff_t numElements() const; + + /// Returns the size array. + __host__ __device__ __forceinline__ const IndexT* sizes() const { + return size_; + } + + /// Returns the stride array. + __host__ __device__ __forceinline__ const IndexT* strides() const { + return stride_; + } + + /// Returns true if there is no padding within the tensor and no + /// re-ordering of the dimensions. + /// ~~~ + /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0 + /// ~~~ + __host__ __device__ bool isContiguous() const; + + /// Returns whether a given dimension has only increasing stride + /// from the previous dimension. A tensor that was permuted by + /// exchanging size and stride only will fail this check. + /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`. + __host__ __device__ bool isConsistentlySized(int i) const; + + // Returns whether at each dimension `stride <= size`. + // If this is not the case then iterating once over the size space will + // touch the same memory locations multiple times. + __host__ __device__ bool isConsistentlySized() const; + + /// Returns true if the given dimension range [first, last) has no padding. + __host__ __device__ bool isContiguousRange(int first, int last) const; + + /// Returns a tensor of the same dimension after transposing the two + /// dimensions given. Does not actually move elements; transposition + /// is made by permuting the size/stride arrays. + /// If the dimensions are not valid, asserts. + __host__ __device__ THCDeviceTensor + transpose(int dim1, int dim2) const; + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the leading dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` + template + __host__ __device__ THCDeviceTensor + upcastOuter(); + + /// Upcast a tensor of dimension `D` to some tensor of dimension + /// D' > D by padding the lowest/most varying dimensions by 1 + /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]` + template + __host__ __device__ THCDeviceTensor + upcastInner(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + THCDeviceTensor downcastOuter(); + + /// Downcast a tensor of dimension `D` to some tensor of dimension + /// D' < D by collapsing the leading dimensions. asserts if there is + /// padding on the leading dimensions. + template + __host__ __device__ + THCDeviceTensor downcastInner(); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting at `at`. + template + __host__ __device__ THCDeviceTensor + view(DataPtrType at); + + /// Returns a tensor that is a view of the `SubDim`-dimensional slice + /// of this tensor, starting where our data begins + template + __host__ __device__ THCDeviceTensor + view(); + + /// Zeroes out the tensor asynchronously. Asserts if the contents + /// in question are not contiguous. 
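+  /// (Illustrative, with assumed names: given a tensor `t` and a stream `s`,
+  /// `t.zero(s)` issues a single cudaMemsetAsync over
+  /// numElements() * sizeof(T) bytes.)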
+ void zero(cudaStream_t stream = 0); + + private: + /// Raw pointer to where the tensor data begins + DataPtrType data_; + + /// Array of strides (in sizeof(T) terms) per each dimension + IndexT stride_[Dim]; + + /// Size per each dimension + IndexT size_[Dim]; +}; + +namespace detail { + +/// Specialization for a view of a single value (0-dimensional) +template class PtrTraits> +class THCDeviceSubTensor { + public: + __host__ __device__ THCDeviceSubTensor + operator=(typename TensorType::DataType val) { + *data_ = val; + return *this; + } + + // operator T& + __host__ __device__ operator typename TensorType::DataType&() { + return *data_; + } + + // const operator T& returning const T& + __host__ __device__ operator const typename TensorType::DataType&() const { + return *data_; + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ __forceinline__ + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ __forceinline__ typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ __forceinline__ T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + private: + /// One dimension greater can create us + friend class THCDeviceSubTensor; + + /// Our parent tensor can create us + friend class THCDeviceTensor; + + __host__ __device__ __forceinline__ THCDeviceSubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// Where our value is located + typename TensorType::DataPtrType const data_; +}; + +/// A `SubDim`-rank slice of a parent THCDeviceTensor +template class PtrTraits> +class THCDeviceSubTensor { + public: + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor). + __host__ __device__ __forceinline__ + THCDeviceSubTensor + operator[](typename TensorType::IndexType index) { + return THCDeviceSubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + + /// Returns a view of the data located at our offset (the dimension + /// `SubDim` - 1 tensor) (const). 
+ __host__ __device__ __forceinline__ + const THCDeviceSubTensor + operator[](typename TensorType::IndexType index) const { + return THCDeviceSubTensor( + tensor_, + data_ + index * tensor_.getStride(TensorType::NumDim - SubDim)); + } + + // operator& returning T* + __host__ __device__ typename TensorType::DataType* operator&() { + return data_; + } + + // const operator& returning const T* + __host__ __device__ const typename TensorType::DataType* operator&() const { + return data_; + } + + /// Returns a raw accessor to our slice. + __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() { + return data_; + } + + /// Returns a raw accessor to our slice (const). + __host__ __device__ __forceinline__ + const typename TensorType::DataPtrType data() const { + return data_; + } + + /// Cast to a different datatype. + template + __host__ __device__ T& as() { + return *dataAs(); + } + + /// Cast to a different datatype (const). + template + __host__ __device__ const T& as() const { + return *dataAs(); + } + + /// Cast to a different datatype + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() { + return reinterpret_cast::PtrType>(data_); + } + + /// Cast to a different datatype (const) + template + __host__ __device__ __forceinline__ + typename PtrTraits::PtrType dataAs() const { + return reinterpret_cast::PtrType>(data_); + } + + /// Use the texture cache for reads + __device__ __forceinline__ typename TensorType::DataType ldg() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(data_); +#else + return *data_; +#endif + } + + /// Use the texture cache for reads; cast as a particular type + template + __device__ __forceinline__ T ldgAs() const { +#if __CUDA_ARCH__ >= 350 + return __ldg(dataAs()); +#else + return as(); +#endif + } + + /// Returns a tensor that is a view of the SubDim-dimensional slice + /// of this tensor, starting where our data begins + THCDeviceTensor view() { + return tensor_.template view(data_); + } + + private: + /// One dimension greater can create us + friend class THCDeviceSubTensor; + + /// Our parent tensor can create us + friend class + THCDeviceTensor; + + __host__ __device__ __forceinline__ THCDeviceSubTensor( + TensorType& t, + typename TensorType::DataPtrType data) + : tensor_(t), + data_(data) { + } + + /// The tensor we're referencing + TensorType& tensor_; + + /// The start of our sub-region + typename TensorType::DataPtrType const data_; +}; + +} // namespace detail + +template class PtrTraits> +__host__ __device__ __forceinline__ +detail::THCDeviceSubTensor, + Dim - 1, PtrTraits> +THCDeviceTensor::operator[](IndexT index) { + return detail::THCDeviceSubTensor( + detail::THCDeviceSubTensor( + *this, data_)[index]); +} + +template class PtrTraits> +__host__ __device__ __forceinline__ +const detail::THCDeviceSubTensor, + Dim - 1, PtrTraits> +THCDeviceTensor::operator[](IndexT index) const { + return detail::THCDeviceSubTensor( + detail::THCDeviceSubTensor( + const_cast(*this), data_)[index]); +} + +#include "THCDeviceTensor-inl.cuh" + +#endif // THC_DEVICE_TENSOR_INC diff --git a/aten/src/THC/THCDeviceTensorUtils-inl.cuh b/aten/src/THC/THCDeviceTensorUtils-inl.cuh new file mode 100644 index 0000000..469dd5f --- /dev/null +++ b/aten/src/THC/THCDeviceTensorUtils-inl.cuh @@ -0,0 +1,118 @@ +namespace detail { + +// Add a layer of SFINAE to support static_assert +template class PtrTraits, + int NewDim, bool B> +struct UpcastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t); +}; + 
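+// The boolean parameter B is computed as (NewDim > Dim) at the call site
+// (see SWITCH_UNROLL_CUDA_CAST_FACTORY below): only the B == true
+// specialization defines make(), while the B == false case inherits the
+// undefined declaration above and is never instantiated.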
+template class PtrTraits, + int NewDim, bool B> +struct UpcastTHC : + UpcastTHCRoot { +}; + +// Never instantiated SFINAE purposes only +template class PtrTraits, + int NewDim> +struct UpcastTHC : + UpcastTHCRoot { +}; + +template class PtrTraits, + int NewDim> +struct UpcastTHC : + UpcastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t) { + thc_static_assert(NewDim > Dim); + return toDeviceTensor(state, t). + template upcastOuter(); + } +}; + +// Add a layer of SFINAE to support static_assert +template class PtrTraits, + int NewDim, bool B> +struct DowncastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t); +}; + +template class PtrTraits, + int NewDim, bool B> +struct DowncastTHC : + DowncastTHCRoot { +}; + +// Never instantiated SFINAE purposes only +template class PtrTraits, + int NewDim> +struct DowncastTHC : + DowncastTHCRoot { +}; + +template class PtrTraits, + int NewDim> +struct DowncastTHC : + DowncastTHCRoot { + static THCDeviceTensor + make(THCState* state, THCudaTensor* t) { + thc_static_assert(NewDim < Dim); + return toDeviceTensor(state, t). + template downcastOuter(); + } +}; + +} // namespace detail + +#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i) \ + case i: \ + if (NewDim > i) { \ + return detail::UpcastTHC i)>:: \ + make(state, t); \ + } else if (NewDim == i) { \ + return toDeviceTensor(state, t); \ + } else { \ + return detail::DowncastTHC:: \ + make(state, t); \ + } \ + /* break; */ + +template class PtrTraits> +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + switch (THCudaTensor__nDimension(state, t)) { + SWITCH_UNROLL_CUDA_CAST_FACTORY(1); + SWITCH_UNROLL_CUDA_CAST_FACTORY(2); + SWITCH_UNROLL_CUDA_CAST_FACTORY(3); + SWITCH_UNROLL_CUDA_CAST_FACTORY(4); + SWITCH_UNROLL_CUDA_CAST_FACTORY(5); + SWITCH_UNROLL_CUDA_CAST_FACTORY(6); + SWITCH_UNROLL_CUDA_CAST_FACTORY(7); + SWITCH_UNROLL_CUDA_CAST_FACTORY(8); + SWITCH_UNROLL_CUDA_CAST_FACTORY(9); + SWITCH_UNROLL_CUDA_CAST_FACTORY(10); + default: + ; + } + + // Not implemented + THError("THCDeviceTensor dimension size not supported"); + return NULL; /* never enters this piece, appeasing compiler warnings */ +} + +#undef SWITCH_UNROLL_CUDA_CAST_FACTORY diff --git a/aten/src/THC/THCDeviceTensorUtils.cuh b/aten/src/THC/THCDeviceTensorUtils.cuh new file mode 100644 index 0000000..2ab9d4e --- /dev/null +++ b/aten/src/THC/THCDeviceTensorUtils.cuh @@ -0,0 +1,80 @@ +#ifndef THC_DEVICE_TENSOR_UTILS_INC +#define THC_DEVICE_TENSOR_UTILS_INC + +#include "THCDeviceTensor.cuh" +#include "THCTensor.hpp" +#include + +/// Constructs a DeviceTensor initialized from a THCudaTensor by +/// upcasting or downcasting the tensor to that of a different +/// dimension. +template class PtrTraits> +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t); + +template +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + return toDeviceTensorCast(state, t); +} + +template +THCDeviceTensor +toDeviceTensorCast(THCState* state, THCudaTensor* t) { + return toDeviceTensorCast(state, t); +} + +/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will +/// error if the dimensionality does not match exactly. 
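+/// Illustrative usage sketch (names assumed):
+///   THCDeviceTensor<float, 2> d = toDeviceTensor<float, 2>(state, t);
+///   // in device code: d[i][j] = 0.0f;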
+template class PtrTraits> +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t); + +template +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + return toDeviceTensor(state, t); +} + +template +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + return toDeviceTensor(state, t); +} + +template class PtrTraits> +THCDeviceTensor +toDeviceTensor(THCState* state, THCTensor* t) { + if (Dim != THCTensor__nDimension(state, t)) { + THError("THCudaTensor dimension mismatch"); + } + // Determine the maximum offset into the tensor achievable; `IndexT` + // must be smaller than this type in order to use it. + ptrdiff_t maxOffset = 0; + IndexT sizes[Dim]; + IndexT strides[Dim]; + + for (int i = 0; i < Dim; ++i) { + int64_t size = THCTensor_size(state, t, i); + int64_t stride = THCTensor_stride(state, t, i); + + maxOffset += (size - 1) * stride; + + sizes[i] = (IndexT) size; + strides[i] = (IndexT) stride; + } + + if (maxOffset > std::numeric_limits::max()) { + THError("THCudaTensor sizes too large for THCDeviceTensor conversion"); + } + + return THCDeviceTensor( + t->data(), sizes, strides); +} + +#include "THCDeviceTensorUtils-inl.cuh" + +#endif // THC_DEVICE_TENSOR_UTILS_INC diff --git a/aten/src/THC/THCDeviceUtils.cuh b/aten/src/THC/THCDeviceUtils.cuh new file mode 100644 index 0000000..7f16455 --- /dev/null +++ b/aten/src/THC/THCDeviceUtils.cuh @@ -0,0 +1,112 @@ +#ifndef THC_DEVICE_UTILS_INC +#define THC_DEVICE_UTILS_INC + +#include +/* The largest consecutive integer representable in float32 (2^24) */ +#define FLOAT32_MAX_CONSECUTIVE_INT 16777216.0f + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T THCCeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +/** + Computes ceil(a / b) * b; i.e., rounds up `a` to the next highest + multiple of b +*/ +template +__host__ __device__ __forceinline__ T THCRoundUp(T a, T b) { + return THCCeilDiv(a, b) * b; +} + +/** + * For CC 3.5+, perform a load using __ldg + */ +template +__device__ __forceinline__ T doLdg(const T* p) { +#if __CUDA_ARCH__ >= 350 + return __ldg(p); +#else + return *p; +#endif +} + +__device__ __forceinline__ unsigned int ACTIVE_MASK() +{ +#if CUDA_VERSION >= 9000 + return __activemask(); +#else +// will be ignored anyway + return 0xffffffff; +#endif +} + +__device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __ballot_sync(mask, predicate); +#else + return __ballot(predicate); +#endif +} + +#ifdef __HIP_PLATFORM_HCC__ +//To handle ambiguity, add a type double version. 
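+// (The generic template below would forward a double argument to the shuffle
+// intrinsic, which HIP does not support for double, so this overload
+// round-trips through float at reduced precision.)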
+__device__ __forceinline__ double WARP_SHFL_XOR(double value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) { + //(HIP doesn't support double) + return (double) __shfl_xor((float) value, laneMask, width); +} +#endif +template +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_sync(mask, value, srcLane, width); +#else + return __shfl(value, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_up_sync(mask, value, delta, width); +#else + return __shfl_up(value, delta, width); +#endif +} + +#ifdef __HIP_PLATFORM_HCC__ +//To handle ambiguity, add a type double version. +__device__ __forceinline__ double WARP_SHFL_DOWN(double value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ + //(HIP doesn't support double) + return (double) __shfl_down((float) value, delta, width); +} +#endif +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + + +#endif // THC_DEVICE_UTILS_INC diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp new file mode 100644 index 0000000..bde2c39 --- /dev/null +++ b/aten/src/THC/THCGeneral.cpp @@ -0,0 +1,761 @@ +#include "THCGeneral.h" +#include "TH.h" +#include "THCAllocator.h" +#include "THCCachingHostAllocator.h" +#include "THCThreadLocal.h" +#include "THCTensorRandom.h" +#include "THCGeneral.hpp" + +#include "ATen/CUDAStream.h" + +#include "THCCachingAllocator.h" +#include +#include + +/* Size of scratch space available in global memory per each SM + stream */ +#define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float) + +/* Minimum amount of scratch space per device. Total scratch memory per + * device is either this amount, or the # of SMs * the space per SM defined + * above, whichever is greater.*/ +#define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE 32768 * sizeof(float) + +/* Maximum number of P2P connections (if there are more than 9 then P2P is + * enabled in groups of 8). 
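+ * For example, with 12 visible devices, devices 0-7 and 8-11 form separate
+ * peer groups: p2pAccessEnabled[0][9] is initialized to 0 (never allowed),
+ * while p2pAccessEnabled[0][3] starts at -1 (unknown until queried).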
*/ +#define THC_CUDA_MAX_PEER_SIZE 8 + +void THCState_free(THCState* state) +{ + free(state); +} + +THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( + THCState *state, int device); + +THCState* THCState_alloc(void) +{ + THCState* state = (THCState*) malloc(sizeof(THCState)); + memset(state, 0, sizeof(THCState)); + return state; +} + +static void THDefaultDeviceDeleter(void* ptr) { + THCudaCheck(cudaFree(ptr)); +} + +struct THDefaultDeviceAllocator final : public at::Allocator { + at::DataPtr allocate(size_t size) const override { + void* p = nullptr; + if (size != 0) THCudaCheck(cudaMalloc(&p, size)); + int device; + THCudaCheck(cudaGetDevice(&device)); + return {p, p, &THDefaultDeviceDeleter, at::Device(at::kCUDA, device)}; + } + at::DeleterFnPtr raw_deleter() const override { + return &THDefaultDeviceDeleter; + } +}; + +static THDefaultDeviceAllocator defaultDeviceAllocator; + +void THCudaInit(THCState* state) +{ + if (!state->cudaDeviceAllocator) { + state->cudaDeviceAllocator = &defaultDeviceAllocator; + } + if (!state->cudaHostAllocator) { + state->cudaHostAllocator = getTHCudaHostAllocator(); + } + if (!state->cudaUVAAllocator) { + state->cudaUVAAllocator = getTHCUVAAllocator(); + } + + int numDevices = 0; + THCudaCheck(cudaGetDeviceCount(&numDevices)); + state->numDevices = numDevices; + + int device = 0; + THCudaCheck(cudaGetDevice(&device)); + + state->currentPerDeviceBlasHandle = THCThreadLocal_alloc(); + state->currentPerDeviceSparseHandle = THCThreadLocal_alloc(); + + state->resourcesPerDevice = (THCCudaResourcesPerDevice*) + malloc(numDevices * sizeof(THCCudaResourcesPerDevice)); + memset(state->resourcesPerDevice, 0, numDevices * sizeof(THCCudaResourcesPerDevice)); + + state->deviceProperties = + (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp)); + + state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState)); + THCRandom_init(state, numDevices, device); + + // By default, all direct p2p kernel access (besides copy) is disallowed, + // since direct access without knowing whether or not a certain operation + // should be cross-GPU leads to synchronization errors. The user can choose + // to disable this functionality, however. + state->p2pKernelAccessEnabled = 0; + + // p2pAccessEnabled records if p2p copies are allowed between pairs of + // devices. Values include "1" (copy allowed), "0" (copy not allowed), and + // "-1" (unknown). + // Currently the max number of gpus in P2P group is 8, so if there are more + // we enable P2P in groups of 8 + state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices); + for (int i = 0; i < numDevices; ++i) { + state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices); + for (int j = 0; j < numDevices; ++j) + if (i == j) + state->p2pAccessEnabled[i][j] = 1; + else if (j / THC_CUDA_MAX_PEER_SIZE != i / THC_CUDA_MAX_PEER_SIZE) + state->p2pAccessEnabled[i][j] = 0; + else + state->p2pAccessEnabled[i][j] = -1; + } + + for (int i = 0; i < numDevices; ++i) { + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i); + THCudaCheck(cudaSetDevice(i)); + THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i)); + + /* The scratch space that we want to have available per each device is + based on the number of SMs available per device. We guarantee a + minimum of 128kb of space per device, but to future-proof against + future architectures that may have huge #s of SMs, we guarantee that + we have at least 16 bytes for each SM. 
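+       For example (illustrative): an 80-SM device needs 80 * 16 = 1280 bytes,
+       well under the 128kb floor, so it receives the floor; the per-SM term
+       only wins on a device with more than 8192 SMs.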
*/ + int numSM = state->deviceProperties[i].multiProcessorCount; + size_t sizePerStream = + MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ? + MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE : + numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM; + res->scratchSpacePerStream = sizePerStream; + } + + /* Restore to previous device */ + THCudaCheck(cudaSetDevice(device)); + + // Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC + // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS + // handles are created lazily. + state->numUserBlasHandles = 1; + state->numUserSparseHandles = 1; + + state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically + state->heapDelta = 0; +} + +void THCudaShutdown(THCState* state) +{ + THCRandom_shutdown(state); + + free(state->rngState); + free(state->deviceProperties); + + int deviceCount = 0; + int prevDev = -1; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaGetDeviceCount(&deviceCount)); + + /* cleanup p2p access state */ + for (int dev = 0; dev < deviceCount; ++dev) { + free(state->p2pAccessEnabled[dev]); + } + free(state->p2pAccessEnabled); + + /* cleanup per-device state */ + for (int dev = 0; dev < deviceCount; ++dev) { + THCudaCheck(cudaSetDevice(dev)); + THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); + /* Free user defined BLAS handles */ + for (int i = 0; i < res->numBlasHandles; ++i) { + THCublasCheck(cublasDestroy(res->blasHandles[i])); + } + /* Free user defined sparse handles */ + for (int i = 0; i < res->numSparseHandles; ++i) { + THCusparseCheck(cusparseDestroy(res->sparseHandles[i])); + } + + free(res->blasHandles); + free(res->sparseHandles); + } + free(state->resourcesPerDevice); + if (state->cudaDeviceAllocator == THCCachingAllocator_get()) { + THCCachingAllocator_emptyCache(); + } + if (state->cudaHostAllocator == getTHCCachingHostAllocator()) { + THCCachingHostAllocator_emptyCache(); + } + THCThreadLocal_free(state->currentPerDeviceBlasHandle); + + THCudaCheck(cudaSetDevice(prevDev)); +} + +int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess) +{ + if (dev < 0 || dev >= state->numDevices) { + THError("%d is not a device", dev); + } + if (devToAccess < 0 || devToAccess >= state->numDevices) { + THError("%d is not a device", devToAccess); + } + if (state->p2pAccessEnabled[dev][devToAccess] == -1) { + int prevDev = 0; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(dev)); + + int access = 0; + THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess)); + if (access) { + cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + // ignore and clear the error if access was already enabled + cudaGetLastError(); + } else { + THCudaCheck(err); + } + state->p2pAccessEnabled[dev][devToAccess] = 1; + } else { + state->p2pAccessEnabled[dev][devToAccess] = 0; + } + + THCudaCheck(cudaSetDevice(prevDev)); + } + return state->p2pAccessEnabled[dev][devToAccess]; +} + +void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess, + int enable) +{ + /* This will perform device bounds checking for us */ + int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess); + + if (enable != prevEnabled) { + /* If we're attempting to enable p2p access but p2p access isn't */ + /* supported, throw an error */ + if (enable) { + int access = 0; + THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess)); + + if (!access) { + THError("p2p 
access not supported for %d accessing %d", + dev, devToAccess); + } + } + + state->p2pAccessEnabled[dev][devToAccess] = enable; + + int prevDev = 0; + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(dev)); + + /* This should be in sync with the current access state */ + if (enable) { + THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0)); + } else { + THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess)); + } + + THCudaCheck(cudaSetDevice(prevDev)); + } +} + +int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) { + return state->p2pKernelAccessEnabled; +} + +void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) { + state->p2pKernelAccessEnabled = val; +} + +struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state) +{ + int curDev = -1; + THCudaCheck(cudaGetDevice(&curDev)); + + return &(state->deviceProperties[curDev]); +} + +struct cudaDeviceProp* THCState_getDeviceProperties(THCState* state, int device) +{ + THAssert(device >= 0 && device < state->numDevices); + return &(state->deviceProperties[device]); +} + +struct THCRNGState* THCState_getRngState(THCState *state) +{ + return state->rngState; +} + +THAllocator* THCState_getCudaHostAllocator(THCState* state) +{ + return state->cudaHostAllocator; +} + +THAllocator* THCState_getCudaUVAAllocator(THCState* state) +{ + return state->cudaUVAAllocator; +} + +THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state) +{ + return state->cudaDeviceAllocator; +} + +void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator) +{ + state->cudaDeviceAllocator = allocator; +} + +int THCState_isCachingAllocatorEnabled(THCState* state) { + return state->cudaHostAllocator == getTHCCachingHostAllocator(); +} + +int THCState_getNumDevices(THCState *state) +{ + return state->numDevices; +} + +void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles) +{ + int prevDev = -1; + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + if (numBlasHandles <= res->numBlasHandles) { + return; + } + + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(device)); + + size_t size = numBlasHandles * sizeof(cublasHandle_t); + cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size); + for (int i = res->numBlasHandles; i < numBlasHandles; ++i) { + handles[i] = NULL; + THCublasCheck(cublasCreate(&handles[i])); + } + res->blasHandles = handles; + res->numBlasHandles = numBlasHandles; + + THCudaCheck(cudaSetDevice(prevDev)); +} + +void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles) +{ + int prevDev = -1; + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + if (numSparseHandles <= res->numSparseHandles) { + return; + } + + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(device)); + + size_t size = numSparseHandles * sizeof(cusparseHandle_t); + cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size); + for (int i = res->numSparseHandles; i < numSparseHandles; ++i) { + handles[i] = NULL; + THCusparseCheck(cusparseCreate(&handles[i])); + } + res->sparseHandles = handles; + res->numSparseHandles = numSparseHandles; + + THCudaCheck(cudaSetDevice(prevDev)); +} + +void THCState_reserveBlasHandles(THCState* state, int numBlasHandles) +{ + // cuBLAS handles are created lazily from THCState_getDeviceBlasHandle + // to avoid initializing unused devices + if (numBlasHandles > 
state->numUserBlasHandles) + { + state->numUserBlasHandles = numBlasHandles; + } +} + +void THCState_reserveSparseHandles(THCState* state, int numSparseHandles) +{ + // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle + // to avoid initializing unused devices + if (numSparseHandles > state->numUserSparseHandles) + { + state->numUserSparseHandles = numSparseHandles; + } +} + +int THCState_getNumBlasHandles(THCState* state) +{ + return state->numUserBlasHandles; +} + +int THCState_getNumSparseHandles(THCState* state) +{ + return state->numUserSparseHandles; +} + +THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( + THCState *state, int device) +{ + /* `device` is a CUDA index */ + if (device >= state->numDevices || device < 0) + { + THError("%d is not a device", device + 1 /* back to Torch index */); + } + + return &(state->resourcesPerDevice[device]); +} + +cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle) +{ + if (handle <= 0 || handle > state->numUserBlasHandles) { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserBlasHandles); + } + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + THCState_reserveDeviceBlasHandles(state, device, handle); + return res->blasHandles[handle - 1]; +} + +cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle) +{ + if (handle <= 0 || handle > state->numUserSparseHandles) { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + THCState_reserveDeviceSparseHandles(state, device, handle); + return res->sparseHandles[handle - 1]; +} + +THCStream* THCState_getStreamOnDevice(THCState* state, int device) { + return at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); +} + +void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream) { + at::detail::CUDAStream_setStreamOnDevice(device, stream); +} + +cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device) { + return at::detail::CUDAStream_stream( + at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); +} + +cudaStream_t THCState_getCurrentStream(THCState *state) { + return at::detail::CUDAStream_stream( + at::detail::CUDAStream_getCurrentStreamUnsafe()); +} + +THCStream* THCState_getStream(THCState *state) { + return at::detail::CUDAStream_getCurrentStreamUnsafe(); +} + +void THCState_setStream(THCState *state, THCStream *stream) { + at::detail::CUDAStream_setStream(stream); +} + +cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) +{ + /* This is called at the point of kernel execution. + For some debugging code or improperly instrumented kernels, + `state` is null */ + if (state) { + int device; + THCudaCheck(cudaGetDevice(&device)); + + int handle = THCState_getCurrentBlasHandleIndex(state); + return THCState_getDeviceBlasHandle(state, device, handle); + } + THError("THCState and blasHandles must be set as there is no default blasHandle"); + return NULL; +} + +cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) +{ + /* This is called at the point of kernel execution. 
+ For some debugging code or improperly instrumented kernels, + `state` is null */ + if (state) { + int device; + THCudaCheck(cudaGetDevice(&device)); + + int handle = THCState_getCurrentSparseHandleIndex(state); + return THCState_getDeviceSparseHandle(state, device, handle); + } + THError("THCState and sparseHandles must be set as there is no default sparseHandle"); + return NULL; +} + +int THCState_getCurrentBlasHandleIndex(THCState *state) +{ + void* value = THCThreadLocal_get(state->currentPerDeviceBlasHandle); + if (value == NULL) { + return 1; + } + return (int) (intptr_t) value; +} + +int THCState_getCurrentSparseHandleIndex(THCState *state) +{ + void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle); + if (value == NULL) { + return 1; + } + return (int) (intptr_t) value; +} + +void THCState_setCurrentBlasHandleIndex(THCState *state, int handle) +{ + if (handle > state->numUserBlasHandles || handle <= 0) + { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserBlasHandles); + } + THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle); +} + +void THCState_setCurrentSparseHandleIndex(THCState *state, int handle) +{ + if (handle > state->numUserSparseHandles || handle <= 0) + { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle); +} + +size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state) +{ + int device = -1; + THCudaCheck(cudaGetDevice(&device)); + return THCState_getDeviceScratchSpaceSize(state, device); +} + +size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device) +{ + THCCudaResourcesPerDevice* res = + THCState_getDeviceResourcePtr(state, device); + + return res->scratchSpacePerStream; +} + +void __THCudaCheck(cudaError_t err, const char *file, const int line) +{ + if(err != cudaSuccess) + { + static int alreadyFailed = 0; + if(!alreadyFailed) { + fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err)); + alreadyFailed = 1; + } + _THError(file, line, "cuda runtime error (%d) : %s", err, + cudaGetErrorString(err)); + } +} + +void __THCudaCheckWarn(cudaError_t err, const char *file, const int line) +{ + if(err != cudaSuccess) + { + fprintf(stderr, "THCudaCheckWarn FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err)); + } +} + +void __THCublasCheck(cublasStatus_t status, const char *file, const int line) +{ + if(status != CUBLAS_STATUS_SUCCESS) + { + const char* errmsg = NULL; + + switch(status) + { + case CUBLAS_STATUS_NOT_INITIALIZED: + errmsg = "library not initialized"; + break; + + case CUBLAS_STATUS_ALLOC_FAILED: + errmsg = "resource allocation failed"; + break; + + case CUBLAS_STATUS_INVALID_VALUE: + errmsg = "an invalid numeric value was used as an argument"; + break; + + case CUBLAS_STATUS_ARCH_MISMATCH: + errmsg = "an absent device architectural feature is required"; + break; + + case CUBLAS_STATUS_MAPPING_ERROR: + errmsg = "an access to GPU memory space failed"; + break; + + case CUBLAS_STATUS_EXECUTION_FAILED: + errmsg = "the GPU program failed to execute"; + break; + + case CUBLAS_STATUS_INTERNAL_ERROR: + errmsg = "an internal operation failed"; + break; + + default: + errmsg = "unknown error"; + break; + } + + _THError(file, line, "cublas runtime error : %s", errmsg); + } +} + +void __THCusparseCheck(cusparseStatus_t status, const char *file, const int 
line) +{ + if(status != CUSPARSE_STATUS_SUCCESS) + { + const char* errmsg = NULL; + + switch(status) + { + case CUSPARSE_STATUS_NOT_INITIALIZED: + errmsg = "library not initialized"; + break; + + case CUSPARSE_STATUS_ALLOC_FAILED: + errmsg = "resource allocation failed"; + break; + + case CUSPARSE_STATUS_INVALID_VALUE: + errmsg = "an invalid numeric value was used as an argument"; + break; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + errmsg = "an absent device architectural feature is required"; + break; + + case CUSPARSE_STATUS_MAPPING_ERROR: + errmsg = "an access to GPU memory space failed"; + break; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + errmsg = "the GPU program failed to execute"; + break; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + errmsg = "an internal operation failed"; + break; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + errmsg = "the matrix type is not supported by this function"; + break; + + default: + errmsg = "unknown error"; + break; + } + + _THError(file, line, "cusparse runtime error : %s", errmsg); + } +} + +void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data ) +{ + state->cutorchGCFunction = cutorchGCFunction_; + state->cutorchGCData = data; +} + +void* THCudaMalloc(THCState *state, size_t size) +{ + THCudaCheck(cudaGetLastError()); + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; + if (state->cutorchGCFunction != nullptr) { + try { + return allocator->raw_allocate(size); + } catch (...) { + cudaGetLastError(); // reset OOM error + (state->cutorchGCFunction)(state->cutorchGCData); + return allocator->raw_allocate(size); + } + } else { + return allocator->raw_allocate(size); + } +} + +void THCudaFree(THCState *state, void* ptr) { + state->cudaDeviceAllocator->raw_deallocate(ptr); +} + +at::DataPtr THCudaHostAlloc(THCState *state, size_t size) +{ + THCudaCheck(cudaGetLastError()); + THAllocator* allocator = state->cudaHostAllocator; + return allocator->allocate(size); +} + +void THCudaHostRecord(THCState *state, void *ptr) { + if (state->cudaHostAllocator == getTHCCachingHostAllocator()) { + THCStream* stream = THCState_getStream(state); + THCCachingHostAllocator_recordEvent(ptr, stream); + } +} + +cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes) +{ + size_t largestBlock = 0; + return THCudaMemGetInfoCached(state, freeBytes, totalBytes, &largestBlock); +} + +cudaError_t THCudaMemGetInfoCached(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock) +{ + size_t cachedBytes = 0; + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; + + *largestBlock = 0; + /* get info from CUDA first */ + cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes); + if (ret!= cudaSuccess) + return ret; + + int device; + ret = cudaGetDevice(&device); + if (ret!= cudaSuccess) + return ret; + + /* not always true - our optimistic guess here */ + *largestBlock = *freeBytes; + + if (allocator == THCCachingAllocator_get()) { + THCCachingAllocator_cacheInfo(device, &cachedBytes, largestBlock); + } + + /* Adjust resulting free bytes number. 
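+     (Blocks held by the caching allocator are reusable without a new
+     cudaMalloc, so they are counted as free here.)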
largesBlock unused for now */ + *freeBytes += cachedBytes; + return cudaSuccess; +} + +#undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM +#undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE + +#include "THCStorage.cpp" +#include "THCAllocator.cpp" + +/* from THCHalf.h */ + +half THC_float2half(float f) +{ +#if CUDA_VERSION < 9000 + half h; + TH_float2halfbits(&f, &h.x); + return h; +#else + __half_raw h_raw; + TH_float2halfbits(&f, &h_raw.x); + return half(h_raw); +#endif +} + +float THC_half2float(half h) +{ + float f; +#if CUDA_VERSION < 9000 + TH_halfbits2float(&h.x, &f); +#else + __half_raw h_raw(h); + TH_halfbits2float(&h_raw.x, &f); +#endif + return f; +} diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in new file mode 100644 index 0000000..4275916 --- /dev/null +++ b/aten/src/THC/THCGeneral.h.in @@ -0,0 +1,158 @@ +#ifndef THC_GENERAL_INC +#define THC_GENERAL_INC + +#include "THGeneral.h" +#include "THAllocator.h" +#include "THCThreadLocal.h" +#undef log10 +#undef log1p +#undef log2 +#undef expm1 + +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cusparse.h" + +#cmakedefine USE_MAGMA + +#ifdef __cplusplus +# define THC_EXTERNC extern "C" +#else +# define THC_EXTERNC extern +#endif + +#ifdef _WIN32 +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# define THC_API THC_EXTERNC __declspec(dllexport) +# define THC_CLASS __declspec(dllexport) +# else +# define THC_API THC_EXTERNC __declspec(dllimport) +# define THC_CLASS __declspec(dllimport) +# endif +#else +# define THC_API THC_EXTERNC +# define THC_CLASS +#endif + +#ifndef THAssert +#define THAssert(exp) \ + do { \ + if (!(exp)) { \ + _THError(__FILE__, __LINE__, "assert(%s) failed", #exp); \ + } \ + } while(0) +#endif + +struct THCRNGState; /* Random number generator state. */ +typedef struct CUDAStreamInternals THCStream; +typedef struct THCState THCState; +struct THCState; + +typedef THAllocator THCDeviceAllocator; + +typedef struct _THCCudaResourcesPerDevice { + /* Number of materialized cuBLAS handles */ + int numBlasHandles; + /* Number of materialized cuSparse handles */ + int numSparseHandles; + /* cuBLAS handes are lazily initialized */ + cublasHandle_t* blasHandles; + /* cuSparse handes are lazily initialized */ + cusparseHandle_t* sparseHandles; + /* Size of scratch space per each stream on this device available */ + size_t scratchSpacePerStream; +} THCCudaResourcesPerDevice; + +THC_API THCState* THCState_alloc(void); +THC_API void THCState_free(THCState* state); + +THC_API void THCudaInit(THCState* state); +THC_API void THCudaShutdown(THCState* state); + +/* If device `dev` can access allocations on device `devToAccess`, this will return */ +/* 1; otherwise, 0. */ +THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess); +/* Enables or disables allowed p2p access using cutorch copy. If we are */ +/* attempting to enable access, throws an error if CUDA cannot enable p2p */ +/* access. */ +THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess, + int enable); + +/* By default, direct in-kernel access to memory on remote GPUs is + disabled. When set, this allows direct in-kernel access to remote + GPUs where GPU/GPU p2p access is enabled and allowed. 
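+   Query and toggle this flag with the two accessors declared below.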
*/ +THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state); +THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val); + +THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state); +THC_API struct cudaDeviceProp* THCState_getDeviceProperties(THCState* state, int device); + +THC_API struct THCRNGState* THCState_getRngState(THCState* state); +THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state); +THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state); +THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state); +THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator); +THC_API int THCState_isCachingAllocatorEnabled(THCState* state); + +THC_API void THCMagma_init(THCState *state); + +/* State manipulators and accessors */ +THC_API int THCState_getNumDevices(THCState* state); + +/* Stream API */ +THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device); +THC_API cudaStream_t THCState_getCurrentStream(THCState *state); + +THC_API THCStream* THCState_getStream(THCState *state); +THC_API void THCState_setStream(THCState *state, THCStream* stream); +THC_API THCStream* THCState_getStreamOnDevice(THCState* state, int device); +THC_API void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream); + +THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles); +THC_API int THCState_getNumBlasHandles(THCState* state); + +THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles); +THC_API int THCState_getNumSparseHandles(THCState* state); + +THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle); +THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state); +THC_API int THCState_getCurrentBlasHandleIndex(THCState *state); +THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle); + +THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle); +THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state); +THC_API int THCState_getCurrentSparseHandleIndex(THCState *state); +THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle); + +/* For the current device and stream, returns the allocated scratch space */ +THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state); +THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device); + +#define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs") +#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__) +#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__) +#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__) +#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__) + +THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line); +THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line); +THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line); +THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line); + +THC_API void* THCudaMalloc(THCState *state, size_t size); +THC_API void THCudaFree(THCState *state, void* ptr); + +#ifdef __cplusplus +at::DataPtr THCudaHostAlloc(THCState *state, size_t size); +#endif + +THC_API void THCudaHostRecord(THCState *state, void *ptr); + +THC_API 
cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes); +THC_API cudaError_t THCudaMemGetInfoCached(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock); +THC_API void THCSetGCHandler(THCState *state, + void (*torchGCHandlerFunction)(void *data), + void *data ); + +#endif diff --git a/aten/src/THC/THCGeneral.hpp b/aten/src/THC/THCGeneral.hpp new file mode 100644 index 0000000..89436f7 --- /dev/null +++ b/aten/src/THC/THCGeneral.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include "THCGeneral.h" + +/* Global state of THC. */ +struct THCState { + struct THCRNGState* rngState; + struct cudaDeviceProp* deviceProperties; + /* Set of all allocated resources. blasHandles and sparseHandles do not have + a default and must be explicitly initialized. We always initialize 1 + blasHandle and 1 sparseHandle but we can use more. + */ + THCCudaResourcesPerDevice* resourcesPerDevice; + /* Captured number of devices upon startup; convenience for bounds checking */ + int numDevices; + int numUserBlasHandles; + int numUserSparseHandles; + + /* Allocator using cudaMallocHost. */ + // NB: These allocators (specifically, cudaHostAllocator) MUST implement + // maybeGlobalBoundDeleter, because we have a few use-cases where we need to + // do raw allocations with them (for Thrust). + // TODO: Make this statically obvious + at::Allocator* cudaHostAllocator; + at::Allocator* cudaUVAAllocator; + at::Allocator* cudaDeviceAllocator; + + /* Index of the current selected BLAS handle. The actual BLAS handle used + depends on the current device. */ + THCThreadLocal/**/ currentPerDeviceBlasHandle; + /* Index of the current selected sparse handle. The actual sparse handle used + depends on the current device. */ + THCThreadLocal/**/ currentPerDeviceSparseHandle; + + /* Table of enabled peer-to-peer access between directed pairs of GPUs. + If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */ + int** p2pAccessEnabled; + + /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU + copies are allowed via p2p if p2p access is enabled at all for + the pair of GPUs in question, but if this flag is true, then + all cross-GPU access checks are disabled, allowing kernels to + directly access memory on another GPUs. + Note that p2p access must exist and be enabled for the pair of + GPUs in question. 
*/ + int p2pKernelAccessEnabled; + + void (*cutorchGCFunction)(void *data); + void *cutorchGCData; + ptrdiff_t heapSoftmax; + ptrdiff_t heapDelta; +}; diff --git a/aten/src/THC/THCGenerateAllTypes.h b/aten/src/THC/THCGenerateAllTypes.h new file mode 100644 index 0000000..27a8bd2 --- /dev/null +++ b/aten/src/THC/THCGenerateAllTypes.h @@ -0,0 +1,37 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h" +#endif + +#define THCGenerateAllTypes + +#define THCTypeIdxByte 1 +#define THCTypeIdxChar 2 +#define THCTypeIdxShort 3 +#define THCTypeIdxInt 4 +#define THCTypeIdxLong 5 +#define THCTypeIdxFloat 6 +#define THCTypeIdxDouble 7 +#define THCTypeIdxHalf 8 +#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) + +#include "THCGenerateByteType.h" +#include "THCGenerateCharType.h" +#include "THCGenerateShortType.h" +#include "THCGenerateIntType.h" +#include "THCGenerateLongType.h" +#include "THCGenerateHalfType.h" +#include "THCGenerateFloatType.h" +#include "THCGenerateDoubleType.h" + +#undef THCTypeIdxByte +#undef THCTypeIdxChar +#undef THCTypeIdxShort +#undef THCTypeIdxInt +#undef THCTypeIdxLong +#undef THCTypeIdxFloat +#undef THCTypeIdxDouble +#undef THCTypeIdxHalf +#undef THCTypeIdx_ + +#undef THCGenerateAllTypes +#undef THC_GENERIC_FILE diff --git a/aten/src/THC/THCGenerateByteType.h b/aten/src/THC/THCGenerateByteType.h new file mode 100644 index 0000000..4f76800 --- /dev/null +++ b/aten/src/THC/THCGenerateByteType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateByteType.h" +#endif + +#define real uint8_t +#define accreal int64_t +#define Real Byte +#define CReal CudaByte +#define THC_REAL_IS_BYTE +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_BYTE + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateCharType.h b/aten/src/THC/THCGenerateCharType.h new file mode 100644 index 0000000..ec86b1a --- /dev/null +++ b/aten/src/THC/THCGenerateCharType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateCharType.h" +#endif + +#define real int8_t +#define accreal int64_t +#define Real Char +#define CReal CudaChar +#define THC_REAL_IS_CHAR +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_CHAR + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateDoubleType.h b/aten/src/THC/THCGenerateDoubleType.h new file mode 100644 index 0000000..fdf6a8e --- /dev/null +++ b/aten/src/THC/THCGenerateDoubleType.h @@ -0,0 +1,22 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateDoubleType.h" +#endif + +#define real double +#define accreal double +#define Real Double +#define CReal CudaDouble +#define THC_REAL_IS_DOUBLE +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_DOUBLE + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateFloatType.h b/aten/src/THC/THCGenerateFloatType.h new file mode 100644 index 0000000..997988d --- /dev/null +++ b/aten/src/THC/THCGenerateFloatType.h @@ -0,0 +1,24 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including 
THGenerateFloatType.h" +#endif + +#define real float +/* FIXME: fp64 has bad performance on some platforms; avoid using it unless + we opt into it? */ +#define accreal float +#define Real Float +#define CReal Cuda +#define THC_REAL_IS_FLOAT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_FLOAT + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateFloatTypes.h b/aten/src/THC/THCGenerateFloatTypes.h new file mode 100644 index 0000000..11bf46d --- /dev/null +++ b/aten/src/THC/THCGenerateFloatTypes.h @@ -0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateFloatTypes.h" +#endif + +#define THCGenerateFloatTypes + +#define THCTypeIdxByte 1 +#define THCTypeIdxChar 2 +#define THCTypeIdxShort 3 +#define THCTypeIdxInt 4 +#define THCTypeIdxLong 5 +#define THCTypeIdxFloat 6 +#define THCTypeIdxDouble 7 +#define THCTypeIdxHalf 8 +#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) + +#include "THCGenerateHalfType.h" +#include "THCGenerateFloatType.h" +#include "THCGenerateDoubleType.h" + +#undef THCTypeIdxByte +#undef THCTypeIdxChar +#undef THCTypeIdxShort +#undef THCTypeIdxInt +#undef THCTypeIdxLong +#undef THCTypeIdxFloat +#undef THCTypeIdxDouble +#undef THCTypeIdxHalf +#undef THCTypeIdx_ + +#undef THCGenerateFloatTypes +#undef THC_GENERIC_FILE diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h new file mode 100644 index 0000000..77d4c0a --- /dev/null +++ b/aten/src/THC/THCGenerateHalfType.h @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" +#endif + +#include "THCHalf.h" + +#if defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF) + +#define real half +#define accreal float +#define Real Half + +// if only here via FORCE_TH_HALF, don't define CReal since +// FORCE_TH_HALF should only be used for TH types +#ifdef CUDA_HALF_TENSOR +#define CReal CudaHalf +#endif + +#define THC_REAL_IS_HALF +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real + +#ifdef CUDA_HALF_TENSOR +#undef CReal +#endif + +#undef THC_REAL_IS_HALF + +#endif // defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF) + +#ifndef THCGenerateAllTypes +#ifndef THCGenerateFloatTypes +#undef THC_GENERIC_FILE +#endif +#endif diff --git a/aten/src/THC/THCGenerateIntType.h b/aten/src/THC/THCGenerateIntType.h new file mode 100644 index 0000000..ec393dd --- /dev/null +++ b/aten/src/THC/THCGenerateIntType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateIntType.h" +#endif + +#define real int32_t +#define accreal int64_t +#define Real Int +#define CReal CudaInt +#define THC_REAL_IS_INT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_INT + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateLongType.h b/aten/src/THC/THCGenerateLongType.h new file mode 100644 index 0000000..f47840c --- /dev/null +++ b/aten/src/THC/THCGenerateLongType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateLongType.h" +#endif + +#define real int64_t +#define accreal int64_t +#define Real Long +#define CReal CudaLong +#define THC_REAL_IS_LONG +#line 1 
THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_LONG + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerateShortType.h b/aten/src/THC/THCGenerateShortType.h new file mode 100644 index 0000000..cfc5536 --- /dev/null +++ b/aten/src/THC/THCGenerateShortType.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#error "You must define THC_GENERIC_FILE before including THGenerateShortType.h" +#endif + +#define real int16_t +#define accreal int64_t +#define Real Short +#define CReal CudaShort +#define THC_REAL_IS_SHORT +#line 1 THC_GENERIC_FILE +#include THC_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef CReal +#undef THC_REAL_IS_SHORT + +#ifndef THCGenerateAllTypes +#undef THC_GENERIC_FILE +#endif diff --git a/aten/src/THC/THCGenerator.hpp b/aten/src/THC/THCGenerator.hpp new file mode 100644 index 0000000..ea5d1ba --- /dev/null +++ b/aten/src/THC/THCGenerator.hpp @@ -0,0 +1,20 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include +#include + +typedef struct THCGeneratorState { + struct curandStateMtgp32* gen_states; + struct mtgp32_kernel_params *kernel_params; + int initf; + uint64_t initial_seed; + std::atomic philox_seed_offset; +} THCGeneratorState; + +struct THCGenerator { + std::mutex mutex; /* mutex for using this generator */ + THCGeneratorState state; +}; diff --git a/aten/src/THC/THCHalf.cu b/aten/src/THC/THCHalf.cu new file mode 100644 index 0000000..7863260 --- /dev/null +++ b/aten/src/THC/THCHalf.cu @@ -0,0 +1,51 @@ +#include "THCHalf.h" +#include "THCThrustAllocator.cuh" +#include +#include + +struct __half2floatOp { + __device__ float operator()(half v) { return __half2float(v); } +}; + +struct __float2halfOp { + __device__ half operator()(float v) { return __float2half(v); } +}; + +void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) { + THCThrustAllocator thrustAlloc(state); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + in, in + len, out, __float2halfOp()); +} + +void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { + THCThrustAllocator thrustAlloc(state); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + in, in + len, out, __half2floatOp()); +} + +THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { + cudaDeviceProp* prop = + THCState_getCurrentDeviceProperties(state); + + // CC 5.3+ + return (prop->major > 5 || + (prop->major == 5 && prop->minor == 3)); +} + +THC_EXTERNC int THC_fastHalfInstructions(THCState *state) { + cudaDeviceProp* prop = + THCState_getCurrentDeviceProperties(state); + + // Check for CC 6.0 only (corresponds to P100) + return (prop->major == 6 && prop->minor == 0); +} diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h new file mode 100644 index 0000000..d9b8cba --- /dev/null +++ b/aten/src/THC/THCHalf.h @@ -0,0 +1,35 @@ +#ifndef THC_HALF_CONVERSION_INC +#define THC_HALF_CONVERSION_INC + +#include "THCGeneral.h" + +/* We compile with CudaHalfTensor support if we have this: */ +#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__) +#define CUDA_HALF_TENSOR 1 +#endif + +#ifdef CUDA_HALF_TENSOR + +#include +#include + +#if CUDA_VERSION >= 9000 || 
defined(__HIP_PLATFORM_HCC__) +#ifndef __cplusplus +typedef __half_raw half; +#endif +#endif + +THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); +THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); +THC_API half THC_float2half(float a); +THC_API float THC_half2float(half a); + +/* Check for native fp16 support on the current device (CC 5.3+) */ +THC_API int THC_nativeHalfInstructions(THCState *state); + +/* Check for performant native fp16 support on the current device */ +THC_API int THC_fastHalfInstructions(THCState *state); + +#endif /* CUDA_HALF_TENSOR */ + +#endif diff --git a/aten/src/THC/THCIntegerDivider.cuh b/aten/src/THC/THCIntegerDivider.cuh new file mode 100644 index 0000000..cf71deb --- /dev/null +++ b/aten/src/THC/THCIntegerDivider.cuh @@ -0,0 +1,120 @@ +#ifndef THC_INTEGER_DIVIDER_INC +#define THC_INTEGER_DIVIDER_INC + +#include + +// A utility class to implement integer division by muliplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. +// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + __host__ __device__ DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() { } // Dummy constructor for arrays. + IntDivider(Value d) : divisor(d) { } + + __host__ __device__ inline Value div(Value n) const { return n / divisor; } + __host__ __device__ inline Value mod(Value n) const { return n % divisor; } + __host__ __device__ inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() { } // Dummy constructor for arrays. 
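  // Worked example of the scheme described above (illustrative only, not
  // used by the code): for d = 7, the smallest s with 2^s >= 7 is s = 3, and
  //   m' = floor(2^32 * (2^3 - 7) / 7) + 1 = 613566757.
  // Dividing n = 100 then proceeds as
  //   t = (m' * 100) >> 32 = 14,  (t + 100) >> 3 = 114 >> 3 = 14 = 100 / 7.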
+ + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + __host__ __device__ inline unsigned int div(unsigned int n) const { +#ifdef __CUDA_ARCH__ + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + __host__ __device__ inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + __host__ __device__ inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +#endif // THC_INTEGER_DIVIDER_INC diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh new file mode 100644 index 0000000..36823c0 --- /dev/null +++ b/aten/src/THC/THCNumerics.cuh @@ -0,0 +1,812 @@ +#ifndef THC_NUMERICS_INC +#define THC_NUMERICS_INC + +#include +#include +#include +#include "THCHalf.h" + +/// Class for numeric limits of the particular data type, which +/// includes support for `half`. +/// Unfortunately since `half` does not have a constructor, these have +/// to be expressed as functions (either that or non-const statics). +template +struct THCNumerics { +}; + +template +static inline __host__ __device__ scalar_t powi(scalar_t a, scalar_t b) { + assert(THCNumerics::ge(b, 0)); + scalar_t result = 1; + while (b) { + if (b & 1) { + result *= a; + } + b /= 2; + a *= a; + } + return result; +} + +template <> +struct THCNumerics { + static inline __host__ __device__ uint8_t min() { return 0; } + static inline __host__ __device__ uint8_t max() { return UCHAR_MAX; } + + static inline __host__ __device__ bool lt(uint8_t a, uint8_t b) { return a < b; } + static inline __host__ __device__ bool le(uint8_t a, uint8_t b) { return a <= b; } + static inline __host__ __device__ bool gt(uint8_t a, uint8_t b) { return a > b; } + static inline __host__ __device__ bool ge(uint8_t a, uint8_t b) { return a >= b; } + static inline __host__ __device__ bool eq(uint8_t a, uint8_t b) { return a == b; } + static inline __host__ __device__ bool ne(uint8_t a, uint8_t b) { return a != b; } + + static inline __host__ __device__ uint8_t neg(int8_t a) { return -a; } + static inline __host__ __device__ uint8_t add(uint8_t a, uint8_t b) { return a + b; } + static inline __host__ __device__ uint8_t mul(uint8_t a, uint8_t b) { return a * b; } + static inline __host__ __device__ uint8_t sub(uint8_t a, uint8_t b) { return a - b; } + static inline __host__ __device__ uint8_t div(uint8_t a, uint8_t b) { return a / b; } + static inline __host__ __device__ uint8_t abs(uint8_t a) { return a; } + static inline __host__ __device__ uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(uint8_t a) { return false; } + static inline __host__ __device__ bool isinf(uint8_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int8_t min() { return 
SCHAR_MIN; } + static inline __host__ __device__ int8_t max() { return SCHAR_MAX; } + + static inline __host__ __device__ bool lt(int8_t a, int8_t b) { return a < b; } + static inline __host__ __device__ bool le(int8_t a, int8_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int8_t a, int8_t b) { return a > b; } + static inline __host__ __device__ bool ge(int8_t a, int8_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int8_t a, int8_t b) { return a == b; } + static inline __host__ __device__ bool ne(int8_t a, int8_t b) { return a != b; } + + static inline __host__ __device__ int8_t neg(int8_t a) { return -a; } + static inline __host__ __device__ int8_t add(int8_t a, int8_t b) { return a + b; } + static inline __host__ __device__ int8_t mul(int8_t a, int8_t b) { return a * b; } + static inline __host__ __device__ int8_t sub(int8_t a, int8_t b) { return a - b; } + static inline __host__ __device__ int8_t div(int8_t a, int8_t b) { return a / b; } + static inline __host__ __device__ int8_t abs(int8_t a) { return ::abs((int)a); } + static inline __host__ __device__ int8_t pow(int8_t a, int8_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int8_t a) { return false; } + static inline __host__ __device__ bool isinf(int8_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int16_t min() { return SHRT_MIN; } + static inline __host__ __device__ int16_t max() { return SHRT_MAX; } + + static inline __host__ __device__ bool lt(int16_t a, int16_t b) { return a < b; } + static inline __host__ __device__ bool le(int16_t a, int16_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int16_t a, int16_t b) { return a > b; } + static inline __host__ __device__ bool ge(int16_t a, int16_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int16_t a, int16_t b) { return a == b; } + static inline __host__ __device__ bool ne(int16_t a, int16_t b) { return a != b; } + + static inline __host__ __device__ int16_t neg(int16_t a) { return -a; } + static inline __host__ __device__ int16_t add(int16_t a, int16_t b) { return a + b; } + static inline __host__ __device__ int16_t mul(int16_t a, int16_t b) { return a * b; } + static inline __host__ __device__ int16_t sub(int16_t a, int16_t b) { return a - b; } + static inline __host__ __device__ int16_t div(int16_t a, int16_t b) { return a / b; } + static inline __host__ __device__ int16_t abs(int16_t a) { return ::abs((int)a); } + static inline __host__ __device__ int16_t pow(int16_t a, int16_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int16_t a) { return false; } + static inline __host__ __device__ bool isinf(int16_t a) { return false; } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ int32_t min() { return INT_MIN; } + static inline __host__ __device__ int32_t max() { return INT_MAX; } + + static inline __host__ __device__ bool lt(int32_t a, int32_t b) { return a < b; } + static inline __host__ __device__ bool le(int32_t a, int32_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int32_t a, int32_t b) { return a > b; } + static inline __host__ __device__ bool ge(int32_t a, int32_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int32_t a, int32_t b) { return a == b; } + static inline __host__ __device__ bool ne(int32_t a, int32_t b) { return a != b; } + + static inline __host__ __device__ int32_t neg(int32_t a) { return -a; } + static inline 
__host__ __device__ int32_t add(int32_t a, int32_t b) { return a + b; } + static inline __host__ __device__ int32_t mul(int32_t a, int32_t b) { return a * b; } + static inline __host__ __device__ int32_t sub(int32_t a, int32_t b) { return a - b; } + static inline __host__ __device__ int32_t div(int32_t a, int32_t b) { return a / b; } + static inline __host__ __device__ int32_t abs(int32_t a) { return ::abs(a); } + static inline __host__ __device__ int32_t pow(int32_t a, int32_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int32_t a) { return false; } + static inline __host__ __device__ bool isinf(int32_t a) { return false; } +}; + +template <> +struct THCNumerics { +#ifdef _MSC_VER + static inline __host__ __device__ int64_t min() { return _I64_MIN; } + static inline __host__ __device__ int64_t max() { return _I64_MAX; } +#else + static inline __host__ __device__ int64_t min() { return LONG_MIN; } + static inline __host__ __device__ int64_t max() { return LONG_MAX; } +#endif + + static inline __host__ __device__ bool lt(int64_t a, int64_t b) { return a < b; } + static inline __host__ __device__ bool le(int64_t a, int64_t b) { return a <= b; } + static inline __host__ __device__ bool gt(int64_t a, int64_t b) { return a > b; } + static inline __host__ __device__ bool ge(int64_t a, int64_t b) { return a >= b; } + static inline __host__ __device__ bool eq(int64_t a, int64_t b) { return a == b; } + static inline __host__ __device__ bool ne(int64_t a, int64_t b) { return a != b; } + + + static inline __host__ __device__ int64_t neg(int64_t a) { return -a; } + static inline __host__ __device__ int64_t add(int64_t a, int64_t b) { return a + b; } + static inline __host__ __device__ int64_t mul(int64_t a, int64_t b) { return a * b; } + static inline __host__ __device__ int64_t sub(int64_t a, int64_t b) { return a - b; } + static inline __host__ __device__ int64_t div(int64_t a, int64_t b) { return a / b; }; + static inline __host__ __device__ int64_t abs(int64_t a) { return labs(a); } + static inline __host__ __device__ int64_t pow(int64_t a, int64_t b) { return powi(a, b); } + static inline __host__ __device__ bool isnan(int64_t a) { return false; } + static inline __host__ __device__ bool isinf(int64_t a) { return false; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct THCNumerics { +#if CUDA_VERSION < 9000 + static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; } + static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; } +#else + static inline __host__ __device__ half min() { __half_raw h; h.x = 0xfbff; return h; } + static inline __host__ __device__ half max() { __half_raw h; h.x = 0x7bff; return h; } +#endif + + static inline __host__ __device__ bool lt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hlt(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa < fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) < THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool le(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hle(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa <= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) <= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool gt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hgt(a, b); +#else + float fa = __half2float(a); + float fb 
= __half2float(b); + return fa > fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) > THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ge(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hge(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa >= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) >= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool eq(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __heq(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa == fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) == THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ne(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hne(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa != fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) != THC_half2float(b); +#endif + } + + static inline __host__ __device__ half exp(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hexp(a); +#else + float fa = __half2float(a); + return __float2half(expf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(expf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half exp10(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hexp10(a); +#else + float fa = __half2float(a); + return __float2half(exp10f(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(exp10f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hlog(a); +#else + float fa = __half2float(a); + return __float2half(logf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(logf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log10(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log10f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log10f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log1p(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log1pf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log1pf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half log2(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(log2f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(log2f(THC_half2float(a))); +#endif + } + +static inline __host__ __device__ half lgamma(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(lgammaf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(lgammaf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half expm1(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(expm1f(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(expm1f(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half cos(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hcos(a); +#else + float fa = __half2float(a); + return __float2half(cosf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(cosf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sin(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return 
hsin(a); +#else + float fa = __half2float(a); + return __float2half(sinf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(sinf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sqrt(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hsqrt(a); +#else + float fa = __half2float(a); + return __float2half(sqrtf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(sqrtf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half rsqrt(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hrsqrt(a); +#else + float fa = __half2float(a); + return __float2half(rsqrtf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(rsqrtf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half ceil(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hceil(a); +#else + float fa = __half2float(a); + return __float2half(ceilf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(ceilf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half floor(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return hfloor(a); +#else + float fa = __half2float(a); + return __float2half(floorf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(floorf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half trunc(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return htrunc(a); +#else + float fa = __half2float(a); + return __float2half(truncf(fa)); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(truncf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half neg(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(a); +#else + float fa = __half2float(a); + return __float2half(-fa); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(-(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half acos(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(acosf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(acosf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half cosh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(coshf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(coshf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half asin(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(asinf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(asinf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half sinh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(sinhf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(sinhf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half tan(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(tanf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(tanf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half atan(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(atanf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(atanf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half tanh(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(tanhf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(tanhf(THC_half2float(a))); 
+#endif + } + + + static inline __host__ __device__ half erf(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erff(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erff(THC_half2float(a))); +#endif + } + + + static inline __host__ __device__ half erfc(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erfcf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erfcf(THC_half2float(a))); +#endif + } + + + static inline __host__ __device__ half erfinv(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(erfinvf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(erfinvf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half abs(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(fabs(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(fabs(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half round(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(roundf(fa)); +#else // __CUDA_ARCH__ + return THC_float2half(roundf(THC_half2float(a))); +#endif + } + + static inline __host__ __device__ half frac(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(fa - truncf(fa)); +#else // __CUDA_ARCH__ + float fa = THC_half2float(a); + return THC_float2half(fa - floorf(fa)); +#endif + } + + static inline __host__ __device__ half cinv(half a) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + return __float2half(1.0f / fa); +#else // __CUDA_ARCH__ + return THC_float2half(1.0f / THC_half2float(a)); +#endif + } + + static inline __host__ __device__ half add(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hadd(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa + fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) + THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half div(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa / fb ); +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) / THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half mul(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hmul(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa * fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) * THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half sub(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hsub(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa - fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) - THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half pow(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(powf(fa, fb)); +#else // __CUDA_ARCH__ + return THC_float2half(powf(THC_half2float(a), THC_half2float(b))); +#endif + } + + static inline __host__ __device__ half atan2(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(atan2f(fa, fb)); +#else // __CUDA_ARCH__ + return THC_float2half(atan2f(THC_half2float(a), 
THC_half2float(b))); +#endif + } + + static inline __host__ __device__ bool isnan(half a) { + // implemented using that a!=a if and only if a is nan + return ne(a, a); + } + + static inline __host__ __device__ bool isinf(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hisinf(a) != 0; +#else + float fa = __half2float(a); + return ::isinf(fa); +#endif +#else // __CUDA_ARCH__ + return ::isinf(THC_half2float(a)); +#endif + } + +}; +#endif + +template <> +struct THCNumerics { + static inline __host__ __device__ float min() { return -FLT_MAX; } + static inline __host__ __device__ float max() { return FLT_MAX; } + + static inline __host__ __device__ bool lt(float a, float b) { return a < b; } + static inline __host__ __device__ bool le(float a, float b) { return a <= b; } + static inline __host__ __device__ bool gt(float a, float b) { return a > b; } + static inline __host__ __device__ bool ge(float a, float b) { return a >= b; } + static inline __host__ __device__ bool eq(float a, float b) { return a == b; } + static inline __host__ __device__ bool ne(float a, float b) { return a != b; } + + static inline __host__ __device__ float lgamma(float a) { return lgammaf(a);} + static inline __host__ __device__ float erfinv(float a) { return erfinvf(a);} + static inline __host__ __device__ float exp (float a) { return expf(a); } + static inline __host__ __device__ float exp10(float a) { return exp10f(a); } + static inline __host__ __device__ float log (float a) { return logf(a); } + static inline __host__ __device__ float log10(float a) { return log10f(a); } + static inline __host__ __device__ float log1p(float a) { return log1pf(a); } + static inline __host__ __device__ float log2 (float a) { return log2f(a); } + static inline __host__ __device__ float expm1(float a) { return expm1f(a); } + static inline __host__ __device__ float cos (float a) { return cosf(a); } + static inline __host__ __device__ float sin (float a) { return sinf(a); } + static inline __host__ __device__ float sqrt (float a) { return sqrtf(a); } + static inline __host__ __device__ float rsqrt(float a) { return rsqrtf(a); } + static inline __host__ __device__ float ceil (float a) { return ceilf(a); } + static inline __host__ __device__ float floor(float a) { return floorf(a); } + static inline __host__ __device__ float trunc(float a) { return truncf(a); } + static inline __host__ __device__ float neg (float a) { return -a; } + static inline __host__ __device__ float acos (float a) { return acosf(a); } + static inline __host__ __device__ float cosh (float a) { return coshf(a); } + static inline __host__ __device__ float acosh(float a) { return acoshf(a); } + static inline __host__ __device__ float asin (float a) { return asinf(a); } + static inline __host__ __device__ float sinh (float a) { return sinhf(a); } + static inline __host__ __device__ float asinh(float a) { return asinhf(a); } + static inline __host__ __device__ float tan (float a) { return tanf(a); } + static inline __host__ __device__ float atan (float a) { return atanf(a); } + static inline __host__ __device__ float tanh (float a) { return tanhf(a); } + static inline __host__ __device__ float erf (float a) { return erff(a); } + static inline __host__ __device__ float erfc (float a) { return erfcf(a); } + static inline __host__ __device__ float abs (float a) { return fabsf(a); } + static inline __host__ __device__ float round(float a) { return roundf(a); } + static inline __host__ __device__ float frac (float a) { return a - truncf(a); } + static 
inline __host__ __device__ float cinv (float a) { return 1.0f / a; } + static inline __host__ __device__ float add (float a, float b) { return a + b; } + static inline __host__ __device__ float div (float a, float b) { return a / b; } + static inline __host__ __device__ float mul (float a, float b) { return a * b; } + static inline __host__ __device__ float sub (float a, float b) { return a - b; } + static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } + static inline __host__ __device__ float atan2(float a, float b) { return atan2f(a, b); } + static inline __host__ __device__ bool isnan(float a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(float a) { return ::isinf(a); } +}; + +template <> +struct THCNumerics { + static inline __host__ __device__ double min() { return -DBL_MAX; } + static inline __host__ __device__ double max() { return DBL_MAX; } + + static inline __host__ __device__ bool lt(double a, double b) { return a < b; } + static inline __host__ __device__ bool le(double a, double b) { return a <= b; } + static inline __host__ __device__ bool gt(double a, double b) { return a > b; } + static inline __host__ __device__ bool ge(double a, double b) { return a >= b; } + static inline __host__ __device__ bool eq(double a, double b) { return a == b; } + static inline __host__ __device__ bool ne(double a, double b) { return a != b; } + + static inline __host__ __device__ double lgamma(double a) { return ::lgamma(a);} + static inline __host__ __device__ double erfinv(double a) { return ::erfinv(a);} + static inline __host__ __device__ double exp (double a) { return ::exp(a); } + static inline __host__ __device__ double exp10(double a) { return ::exp10(a); } + static inline __host__ __device__ double log (double a) { return ::log(a); } + static inline __host__ __device__ double log10(double a) { return ::log10(a); } + static inline __host__ __device__ double log1p(double a) { return ::log1p(a); } + static inline __host__ __device__ double log2 (double a) { return ::log2(a); } + static inline __host__ __device__ double expm1(double a) { return ::expm1(a); } + static inline __host__ __device__ double cos (double a) { return ::cos(a); } + static inline __host__ __device__ double sin (double a) { return ::sin(a); } + static inline __host__ __device__ double sqrt (double a) { return ::sqrt(a); } + static inline __host__ __device__ double rsqrt(double a) { return ::rsqrt(a); } + static inline __host__ __device__ double ceil (double a) { return ::ceil(a); } + static inline __host__ __device__ double floor(double a) { return ::floor(a); } + static inline __host__ __device__ double trunc(double a) { return ::trunc(a); } + static inline __host__ __device__ double neg (double a) { return -a; } + static inline __host__ __device__ double acos (double a) { return ::acos(a); } + static inline __host__ __device__ double cosh (double a) { return ::cosh(a); } + static inline __host__ __device__ double acosh(double a) { return ::acosh(a); } + static inline __host__ __device__ double asin (double a) { return ::asin(a); } + static inline __host__ __device__ double sinh (double a) { return ::sinh(a); } + static inline __host__ __device__ double asinh(double a) { return ::asinh(a); } + static inline __host__ __device__ double tan (double a) { return ::tan(a); } + static inline __host__ __device__ double atan (double a) { return ::atan(a); } + static inline __host__ __device__ double tanh (double a) { return ::tanh(a); } + static inline __host__ __device__ 
double erf (double a) { return ::erf(a); } + static inline __host__ __device__ double erfc (double a) { return ::erfc(a); } + static inline __host__ __device__ double abs (double a) { return ::abs(a); } + static inline __host__ __device__ double round(double a) { return ::round(a); } + static inline __host__ __device__ double frac (double a) { return a - ::trunc(a); } + static inline __host__ __device__ double cinv (double a) { return 1.0 / a; } + static inline __host__ __device__ double add (double a, double b) { return a + b; } + static inline __host__ __device__ double div (double a, double b) { return a / b; } + static inline __host__ __device__ double mul (double a, double b) { return a * b; } + static inline __host__ __device__ double sub (double a, double b) { return a - b; } + static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } + static inline __host__ __device__ double atan2(double a, double b) { return ::atan2(a, b); } + static inline __host__ __device__ bool isnan(double a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(double a) { return ::isinf(a); } +}; + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. +template +struct ScalarConvert { + static __host__ __device__ Out to(const In v) { return (Out) v; } +}; + +#ifdef CUDA_HALF_TENSOR +template +struct ScalarConvert { + static __host__ __device__ Out to(const half v) { +#ifdef __CUDA_ARCH__ + return (Out) __half2float(v); +#else + return (Out) THC_half2float(v); +#endif + } +}; + +template +struct ScalarConvert { + static __host__ __device__ half to(const In v) { +#ifdef __CUDA_ARCH__ + return __float2half((float) v); +#else + return THC_float2half((float) v); +#endif + } +}; + +template <> +struct ScalarConvert { + static __host__ __device__ half to(const half v) { + return v; + } +}; + +template +__host__ __device__ T scalar_cast(U u) { + return ScalarConvert::to(u); +} + +#endif + +#endif // THC_NUMERICS_INC diff --git a/aten/src/THC/THCReduce.cuh b/aten/src/THC/THCReduce.cuh new file mode 100644 index 0000000..2735847 --- /dev/null +++ b/aten/src/THC/THCReduce.cuh @@ -0,0 +1,641 @@ +#ifndef THC_REDUCE_INC +#define THC_REDUCE_INC + +// +// This file contains dimension reduction operation functions and +// kernels that work on both contiguous and non-contiguous tensor +// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned +// arguments without copying or temporary storage. 
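//
// As a concrete illustration of the operation (a sketch, not an API defined
// in this file): reducing dimension 1 of a 4 x 3 tensor with modify =
// identity and reduce = sum computes
//   out[i][0] = in[i][0] + in[i][1] + in[i][2]   for i = 0..3,
// after which finalizeOp is applied to each reduced value; the reduced
// dimension is kept with size 1 and squeezed away afterwards when keepdim
// is false.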
+// + +#include "THCTensorTypeUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCNumerics.cuh" + +// Threads per thread block +#define THC_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 +#define CHUNKPERBLOCK 256 + +template +__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() { + // Each thread handles one slice + return getLinearBlockId() * THC_NONCONTIG_REDUCE_BLOCK_SIZE + threadIdx.x; +} + +// quick hack to enable two-stage use of reduceChunk +template +struct SimpleCopyOp +{ + __device__ __forceinline__ T operator()(const T val) const + { + return val; + } +}; + +__device__ __forceinline__ int lastpow2(int n) +{ + int out = 1 << (31 - __clz(n)); + if(n == out) + out >>= 1; + return out; +} + +template + +__device__ __forceinline__ void reduceChunk + (T* out, + U* in, + const int& inbounds, + const IndexType& reductionStride, + const IndexType& reductionSize, + const IndexType& inOffset, + const IndexType& outOffset, + const int& shmem_lim, + AccT init, + AccT* shmem, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) +{ + AccT load_reg[4]; + AccT local_reg = init; + + //Unroll this loop + //for(IndexType i=threadIdx.y; i(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + const AccT val2 = scalar_cast(in[inOffset + (i + blockDim.y*2)*reductionStride]); + load_reg[2] = modifyOp(val2); + const AccT val3 = scalar_cast(in[inOffset + (i + blockDim.y*3)*reductionStride]); + load_reg[3] = modifyOp(val3); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + local_reg = reduceOp(local_reg, load_reg[2]); + local_reg = reduceOp(local_reg, load_reg[3]); + } + else if (i + blockDim.y*2 < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + const AccT val2 = scalar_cast(in[inOffset + (i + blockDim.y*2)*reductionStride]); + load_reg[2] = modifyOp(val2); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + local_reg = reduceOp(local_reg, load_reg[2]); + } + else if (i + blockDim.y < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + load_reg[0] = modifyOp(val0); + const AccT val1 = scalar_cast(in[inOffset + (i + blockDim.y)*reductionStride]); + load_reg[1] = modifyOp(val1); + local_reg = reduceOp(local_reg, load_reg[0]); + local_reg = reduceOp(local_reg, load_reg[1]); + } + else if (i < reductionSize) + { + const AccT val0 = scalar_cast(in[inOffset + i*reductionStride]); + local_reg = reduceOp(local_reg, modifyOp(val0)); + } + } + + *shmem = local_reg; + for(int i = lastpow2(shmem_lim); i > 0; i >>= 1) + { + __syncthreads(); + if(threadIdx.y < i && threadIdx.y + i < shmem_lim) + *shmem = reduceOp(*shmem, *(shmem + i*blockDim.x)); + } + + if(threadIdx.y == 0 && inbounds) + out[outOffset] = scalar_cast(finalizeOp(*shmem)); +} + +// Kernel that handles an entire reduction of a slice of a tensor per each thread +template + +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void kernelReduceNoncontigDim_shared + (TensorInfo out, + TensorInfo in, + IndexType reductionStride, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp, + volatile 
AccT* stagingData, + int* semaphores) +{ + IndexType sliceIndex = blockIdx.x*blockDim.x + threadIdx.x; + + __shared__ int isLastBlockDone; + __shared__ AccT local_reduce[THC_NONCONTIG_REDUCE_BLOCK_SIZE]; + AccT* shmem = &local_reduce[threadIdx.x + threadIdx.y*blockDim.x]; + + // This kernel is intended for the latency-bound case, so we want to launch enough blocks + // to cover the entire output. This means we don't need grid-stride loops. + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + const IndexType inOffset = + IndexToOffset::get(sliceIndex, in); + const int inbounds = (sliceIndex < totalSlices); + + if(gridDim.y == 1) + reduceChunk + (out.data, + in.data, + inbounds, + reductionStride, + reductionSize, + inOffset, + outOffset, + reductionSize < blockDim.y ? reductionSize : blockDim.y, + init, + shmem, + modifyOp, + reduceOp, + finalizeOp); + else + { + int* semaphore = semaphores + blockIdx.x; + + const IndexType chunkStart = blockIdx.y*CHUNKPERBLOCK; + const IndexType chunkSize = reductionSize - chunkStart < CHUNKPERBLOCK ? + reductionSize - chunkStart : CHUNKPERBLOCK; + const IndexType reductionStrideStaging = totalSlices; + const IndexType stagingOffset = sliceIndex; + + reduceChunk + (stagingData, + in.data, + inbounds, + reductionStride, + chunkSize, + inOffset + chunkStart*reductionStride, + stagingOffset + blockIdx.y*reductionStrideStaging, + chunkSize < blockDim.y ? chunkSize : blockDim.y, + init, + shmem, + modifyOp, + reduceOp, + SimpleCopyOp()); + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + + if(threadIdx.x == 0 && threadIdx.y == 0) + { + int old = atomicAdd(semaphore, 1); + isLastBlockDone = (old == gridDim.y - 1); + } + + __syncthreads(); + + // The staging area contains gridDim.y elements along each slice. The final reduction + // begins by treating the first blockDim.y elements as "init" values. + if(isLastBlockDone) + { + if(threadIdx.y < gridDim.y) + init = stagingData[stagingOffset + threadIdx.y*reductionStrideStaging]; + IndexType remaining = gridDim.y < blockDim.y ? 0 : gridDim.y - blockDim.y; + reduceChunk + (out.data, + stagingData, + inbounds, + reductionStrideStaging, + remaining, // if 0, loop in reduceChunk is skipped, otherwise... + stagingOffset + blockDim.y*reductionStrideStaging, // ...loop begins at blockDim+1th element + outOffset, + gridDim.y < blockDim.y ? 
gridDim.y : blockDim.y, + init, + shmem, + SimpleCopyOp(), + reduceOp, + finalizeOp); + } + } +} + + +// Kernel that handles an entire reduction of a slice of a tensor per each thread +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +kernelReduceNoncontigDim(TensorInfo out, + TensorInfo in, + IndexType reductionStride, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) { + const IndexType sliceIndex = getReduceNoncontigDimSliceIndex(); + + if (sliceIndex >= totalSlices) { + return; + } + + // Each thread picks a point in `out` and `in` for which it is + // producing the reduction + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + const IndexType inBaseOffset = + IndexToOffset::get(sliceIndex, in); + + // For each point in reductionSize, reduce into `r` + IndexType inOffset = inBaseOffset; + AccT r = init; + + for (IndexType i = 0; i < reductionSize; ++i) { + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + inOffset += reductionStride; + } + + // Write out reduced value + out.data[outOffset] = scalar_cast(finalizeOp(r)); +} + +template +__device__ __forceinline__ IndexType getReduceContigDimSliceIndex() { + // Each block handles one slice + return getLinearBlockId(); +} + +// Kernel that handles an entire reduction of a slice of a tensor per +// each block +template +__global__ void +kernelReduceContigDim(TensorInfo out, + TensorInfo in, + IndexType reductionSize, + IndexType totalSlices, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + FinalizeOp finalizeOp) { + const IndexType sliceIndex = getReduceContigDimSliceIndex(); + + if (sliceIndex >= totalSlices) { + return; + } + + // Get the offset in `out` for the reduction + const IndexType outOffset = + IndexToOffset::get(sliceIndex, out); + + // Get the base offset in `in` for this block's reduction + const IndexType inBaseOffset = + IndexToOffset::get(sliceIndex, in); + + // Each thread in the block will reduce some subset of elements in + // the slice. The elements are guaranteed contiguous starting at + // `inBaseOffset`. + AccT r = init; + for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) { + const AccT val = scalar_cast(in.data[inBaseOffset + i]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + // FIXME: extern name + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out reduced value + out.data[outOffset] = scalar_cast(finalizeOp(r)); + } +} + +inline dim3 getNoncontigReduceBlock() { + return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE); +} + +inline dim3 getContigReduceBlock(ptrdiff_t numSlices, int64_t reductionSize) { + // If the number of slices is low but the reduction dimension size + // is high, then we should increase block size for greater parallelism. + // Aim for at least 32 warps per SM (assume 15 SMs; don't bother + // inquiring the real number for now). + int maxWarps = 4; // better occupancy if many blocks are around + // For numSlices > 15 * 8, there are > 32 warps active per SM. 
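  // As a worked instance of this heuristic (illustrative numbers only):
  // numSlices = 1000 with reductionSize = 10000 keeps maxWarps at 4, giving
  // a block of min(ceil(10000/32), 4) * 32 = 128 threads, while
  // numSlices = 20 raises maxWarps to 32 and yields a 1024-thread block.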
+ if (numSlices < 15 * 8) { + maxWarps = 8; + if (numSlices < 15 * 4) { + maxWarps = 16; + if (numSlices < 15 * 2) { + maxWarps = 32; + } + } + } + + // Scale up block size based on the reduction dimension size + int64_t warpsInReductionSize = THCCeilDiv(reductionSize, (int64_t) 32); + int numWarps = warpsInReductionSize > (int64_t) maxWarps ? + maxWarps : (int) warpsInReductionSize; + + return dim3(numWarps * 32); +} + +inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) { + // One output point per thread + return THC_getGridFromTiles(THCCeilDiv(elements, + (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); +} + +inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) { + // One output point per block + return THC_getGridFromTiles(elements, grid); +} + +// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])) for +// all in where i and the out's 0 are indexed at dimension `dim` +template +bool THC_reduceDim(THCState* state, + TensorType* out, + TensorType* in, + const ModifyOp modifyOp, + const ReduceOp reduceOp, + const FinalizeOp finalizeOp, + AccT init, + int dim, + int keepdim) { + ptrdiff_t inElements = THCTensor_nElement(state, in); + + int64_t reductionSize = THCTensor_size(state, in, dim); + int64_t reductionStride = THCTensor_stride(state, in, dim); + ptrdiff_t outElements = inElements / reductionSize; + + if (THCTensor__nDimension(state, out) > MAX_CUTORCH_DIMS || + THCTensor__nDimension(state, in) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, in) == 0) { + // Zero-dim tensor; do nothing + return true; + } + + // Is the reduction dimension contiguous? If so, then we can use a + // shared memory reduction kernel to increase performance. + bool contigReduction = (reductionStride == 1); + + dim3 block; + dim3 grid; + int smemSize = 0; // contiguous reduction uses smem + if (contigReduction) { + if (!getContigReduceGrid(outElements, grid)) { + return false; + } + + block = getContigReduceBlock(outElements, reductionSize); + smemSize = sizeof(AccT) * block.x; + } else { + if (!getNoncontigReduceGrid(outElements, grid)) { + return false; + } + + block = getNoncontigReduceBlock(); + + if(outElements <= 4096) + { + // gridDim.x and blockDim.x parallelize work across slices. + // blockDim.y enables some intra-block reduction within slices. + // gridDim.y enables inter-block reduction within slices. + + // Each block covers 32 output elements. + int blockdimx = 32; + int griddimx = THCCeilDiv((int64_t)outElements, (int64_t)blockdimx); + + // Each warp reduces at most 4 slices. This heuristic can be tuned, + // but locking blockdimy to 16 is robust and reasonably performant. 
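      // Putting the pieces together (illustrative numbers only): for
      // outElements = 1000 and reductionSize = 8192, griddimx =
      // ceil(1000/32) = 32 blocks of 32 x 16 threads cover the slices, and
      // the cooperative path set up below adds griddimy = ceil(8192/256) =
      // 32 blocks along the reduction dimension, whose partial results are
      // combined through stagingData and semaphores.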
+ int blockdimy = 16; + + int griddimy = 1; + bool coop = false; + // Rough heuristics to decide if using cooperating blocks is worthwhile + if( outElements <= 32 && reductionSize >= 4096) coop = true; + if( 32 < outElements && outElements <= 64 && reductionSize >= 4096) coop = true; + if( 64 < outElements && outElements <= 128 && reductionSize >= 4096) coop = true; + if( 128 < outElements && outElements <= 256 && reductionSize >= 4096) coop = true; + if( 256 < outElements && outElements <= 512 && reductionSize >= 4096) coop = true; + if( 512 < outElements && outElements <= 1024 && reductionSize >= 4096) coop = true; + if(1024 < outElements && outElements <= 2048 && reductionSize >= 2048) coop = true; + if(2048 < outElements && outElements <= 4096 && reductionSize >= 2048) coop = true; + // Each block reduces at most CHUNKPERBLOCK (currently 256) slices. + if(coop) + griddimy = THCCeilDiv((int64_t)reductionSize, (int64_t)CHUNKPERBLOCK); + + grid = dim3(griddimx, griddimy, 1); + block = dim3(blockdimx, blockdimy, 1); + } + } + + // Resize out to correspond to the reduced size with keepdim=True. + + // Preserve noncontiguities by unsqueezing out if necessary + THCTensor_preserveReduceDimSemantics( + state, out, THCTensor__nDimension(state, in), dim, keepdim); + + // Resize out + THLongStorage* sizes = THCTensor_newSizeOf(state, in); + THLongStorage_set(sizes, dim, 1); + THCTensor_resize(state, out, sizes, NULL); + THLongStorage_free(sizes); + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
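+  // For example, a contiguous 4 x 5 x 6 tensor collapses to a single
+  // 120-element dimension, so the index-to-offset translation becomes
+  // essentially a single multiply. The HANDLE_*_CASE macros below
+  // instantiate the collapsed 1-D and 2-D specializations plus a
+  // generic -1 (runtime-dimension) fallback for both `out` and `in`.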
+#define HANDLE_CASE(TYPE, OUT, IN) \ + if (contigReduction) { \ + kernelReduceContigDim \ + <<>> \ + (outInfo, inInfo, reductionSize, \ + (TYPE) outElements, init, modifyOp, reduceOp, finalizeOp); \ + } else { \ + if(block.y == 1){ \ + kernelReduceNoncontigDim< \ + ScalarType, \ + TYPE, AccT, ModifyOp, ReduceOp, FinalizeOp, \ + OUT, IN> \ + <<>> \ + (outInfo, inInfo, reductionStride, reductionSize, \ + (TYPE) outElements, init, modifyOp, reduceOp, finalizeOp); \ + } \ + else \ + { \ + void* stagingData; \ + void* semaphores; \ + \ + if(grid.y > 1) \ + { \ + stagingData = THCudaMalloc(state, sizeof(AccT)*outElements*grid.y);\ + semaphores = THCudaMalloc(state, sizeof(int)*grid.x); \ + THCudaCheck(cudaMemsetAsync \ + (semaphores, \ + 0, \ + sizeof(int)*grid.x, \ + THCState_getCurrentStream(state))); \ + } \ + \ + kernelReduceNoncontigDim_shared \ + \ + <<>> \ + (outInfo, \ + inInfo, \ + reductionStride, \ + reductionSize, \ + (TYPE) outElements, \ + init, \ + modifyOp, \ + reduceOp, \ + finalizeOp, \ + (volatile AccT*)stagingData, \ + (int*)semaphores); \ + \ + if(grid.y > 1) \ + { \ + THCudaFree(state, stagingData); \ + THCudaFree(state, semaphores); \ + } \ + } \ + } + +#define HANDLE_IN_CASE(TYPE, OUT, IN) \ + { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(TYPE, OUT, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, OUT, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, OUT, -1); \ + break; \ + } \ + } + +#define HANDLE_OUT_CASE(TYPE, OUT, IN) \ + { \ + switch (OUT) { \ + case 1: \ + HANDLE_IN_CASE(TYPE, 1, IN); \ + break; \ + case 2: \ + HANDLE_IN_CASE(TYPE, 2, IN); \ + break; \ + default: \ + HANDLE_IN_CASE(TYPE, -1, IN); \ + break; \ + } \ + } + + if(THCTensor_canUse32BitIndexMath(state, out) && + THCTensor_canUse32BitIndexMath(state, in)) + { + TensorInfo outInfo = + getTensorInfo(state, out); + outInfo.collapseDims(); + + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.reduceDim(dim); + inInfo.collapseDims(); + HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims); + } + else + { + TensorInfo outInfo = + getTensorInfo(state, out); + outInfo.collapseDims(); + + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.reduceDim(dim); + inInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (outInfo.dims == 1 && inInfo.dims == 1) { + HANDLE_CASE(uint64_t, 1, 1); + } else { + HANDLE_CASE(uint64_t, -1, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_IN_CASE +#undef HANDLE_OUT_CASE + + + if (!keepdim) { + THCTensor_squeeze1d(state, out, out, dim); + } + return true; +} + +#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE +#undef CHUNKPERBLOCK + +#endif // THC_REDUCE_INC diff --git a/aten/src/THC/THCReduceAll.cuh b/aten/src/THC/THCReduceAll.cuh new file mode 100644 index 0000000..5850e77 --- /dev/null +++ b/aten/src/THC/THCReduceAll.cuh @@ -0,0 +1,331 @@ +#ifndef THC_REDUCEALL_INC +#define THC_REDUCEALL_INC + +// +// This file contains dimension reduction operation functions and +// kernels that work on both contiguous and non-contiguous tensor +// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned +// arguments without copying or temporary storage, for reducing an +// entire tensor to one value. 
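+// Small inputs are reduced by a single block in one pass
+// (kernelReduceAll); inputs larger than THC_TWO_PASS_REDUCTION_SIZE
+// are reduced in two passes, with per-block partial results staged in
+// the device scratch space (kernelReduceAllPass1 / kernelReduceAllPass2).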
+// + +#include "THCReduceApplyUtils.cuh" + +// Size per each reduction block +#define THC_REDUCE_ALL_BLOCK_SIZE 1024L + +// Cutoff size for two-pass reduction +#define THC_TWO_PASS_REDUCTION_SIZE 2048L + +// Kernel that handles an entire reduction of a tensor in one pass +template +__global__ void +kernelReduceAll(TensorInfo in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + AccT* out) { + // With a block-wide stride, have each thread perform its own reduction. + AccT r = init; + for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) { + const IndexType inOffset = IndexToOffset::get(i, in); + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out reduced value + *out = r; + } +} + +template +__device__ __forceinline__ IndexType getStartIndex(IndexType totalSize) { + IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x); + return blockIdx.x * sizePerBlock; +} + +template +__device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) { + IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x); + return min((IndexType) ((blockIdx.x + 1) * sizePerBlock), totalSize); +} + +// Kernel that handles an entire reduction of a tensor in two passes +template +__global__ void +kernelReduceAllPass1(TensorInfo in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + AccT* scratchSpace) { + const IndexType startIndex = getStartIndex(totalElements); + const IndexType endIndex = getEndIndex(totalElements); + + // With a block-wide stride, have each thread perform its own reduction. + AccT r = init; + for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) { + const IndexType inOffset = IndexToOffset::get(i, in); + const AccT val = scalar_cast(in.data[inOffset]); + r = reduceOp(r, modifyOp(val)); + } + + // Reduce within the block + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock(smem, blockDim.x, r, reduceOp, init); + + if (threadIdx.x == 0) { + // Write out block-wide reduced value + scratchSpace[blockIdx.x] = r; + } +} + +template +__global__ void +kernelReduceAllPass2(int numPass1Blocks, + T init, + ReduceOp reduceOp, + T* scratchSpace, + T* out) { + T r = init; + if (threadIdx.x < numPass1Blocks) { + r = scratchSpace[threadIdx.x]; + } + + // Reduce within the block + extern __shared__ char smemChar[]; + T* smem = (T*) smemChar; + r = reduceBlock(smem, numPass1Blocks, r, reduceOp, init); + + if (threadIdx.x == 0) { + *out = r; + } +} + +// Perform a two-pass reduction if the tensor is large enough to +// warrant it. 
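+// A rough sketch of the two-pass sizing: with THC_REDUCE_ALL_BLOCK_SIZE
+// = 1024, pass 1 launches about ceil(elements / 1024) blocks, capped by
+// the per-device scratch space and by 1024 so that pass 2 can combine
+// all partial results in a single block (one thread per pass-1 block).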
+inline bool isTwoPassReductionSize(ptrdiff_t elements) { + return (elements > THC_TWO_PASS_REDUCTION_SIZE); +} + +template +inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) { + ptrdiff_t numBlocks = THCCeilDiv(elements, (ptrdiff_t)THC_REDUCE_ALL_BLOCK_SIZE); + + // We can only have as many blocks as there is scratch space + ptrdiff_t scratchSpace = + THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T); + THAssert(scratchSpace > 0); + + // Limit to 1024 due to dimensionality constraint + if (scratchSpace > 1024) { + scratchSpace = 1024; + } + + if (numBlocks > scratchSpace) { + numBlocks = scratchSpace; + } + + return numBlocks; +} + +// Get the block/grid size that we want +template +inline void getPass1ReduceBlockGrid(THCState* state, ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(getTwoPassBlocks(state, elements)); + block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); +} + +template +inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(1); + // We only need as many threads as there were blocks originally + block = dim3(getTwoPassBlocks(state, elements)); +} + +inline void getSinglePassReduceBlockGrid(ptrdiff_t elements, + dim3& grid, dim3& block) { + grid = dim3(1); + block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); +} + +template +void callReduceAll(THCState* state, + const TensorInfo& in, + ptrdiff_t totalElements, + AccT init, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + AccT* devOut) { + dim3 grid; + dim3 block; + + if (isTwoPassReductionSize(totalElements)) { + void* scratchSpace = THCudaMalloc(state, THCState_getCurrentDeviceScratchSpaceSize(state)); + + getPass1ReduceBlockGrid(state, totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); + + kernelReduceAllPass1 + <<>>( + in, (IndexType) totalElements, init, modifyOp, reduceOp, + (AccT*) scratchSpace); + + int numPass1Blocks = grid.x; + getPass2ReduceBlockGrid(state, totalElements, grid, block); + smemSize = block.x * sizeof(AccT); + + kernelReduceAllPass2 + <<>>( + numPass1Blocks, init, reduceOp, + (AccT*) scratchSpace, devOut); + + THCudaFree(state, scratchSpace); + } else { + getSinglePassReduceBlockGrid(totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); + + kernelReduceAll + <<>>( + in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); + } +} + +// Reduces the entire tensor to one value. `out` points to +// host-resident memory. +template +bool THC_reduceAll(THCState* state, + TensorType* in, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + AccT init, + AccT* out, + int outOnDevice) { + ptrdiff_t inElements = THCTensor_nElement(state, in); + + if (THCTensor__nDimension(state, in) > MAX_CUTORCH_DIMS) { + return false; + } + + if (THCTensor__nDimension(state, in) == 0) { + // Zero-dim tensor; do nothing + *out = init; + return true; + } + + bool freeDevOut = false; + AccT* devOut = out; + if (!outOnDevice) { + // Use the stream-specific scratch space for the reduction kernel + // to write out its value + devOut = static_cast(THCudaMalloc(state, + THCState_getCurrentDeviceScratchSpaceSize(state))); + freeDevOut = true; + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. 
+ // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. +#define HANDLE_CASE(TYPE, IN) \ + callReduceAll( \ + state, inInfo, inElements, init, modifyOp, \ + reduceOp, devOut); + +#define HANDLE_IN_CASE(TYPE, IN) \ + { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ + } + + if (THCTensor_canUse32BitIndexMath(state, in)) { + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.collapseDims(); + + HANDLE_IN_CASE(unsigned int, inInfo.dims); + } else { + TensorInfo inInfo = + getTensorInfo(state, in); + inInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (inInfo.dims == 1) { + HANDLE_IN_CASE(uint64_t, 1); + } else { + HANDLE_IN_CASE(uint64_t, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_IN_CASE + + // If our destination is not on the device, copy the value back to + // the host (synchronous!) + if (!outOnDevice) { + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(out, + devOut, + sizeof(AccT), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + } + + if (freeDevOut) { + THCudaFree(state, devOut); + } + + return true; +} + +#undef THC_REDUCE_ALL_BLOCK_SIZE +#undef THC_TWO_PASS_REDUCTION_SIZE + +#endif // THC_REDUCEALL_INC diff --git a/aten/src/THC/THCReduceApplyUtils.cu b/aten/src/THC/THCReduceApplyUtils.cu new file mode 100644 index 0000000..df0169e --- /dev/null +++ b/aten/src/THC/THCReduceApplyUtils.cu @@ -0,0 +1,35 @@ +#include "THCReduceApplyUtils.cuh" + +#include +#include + +// Maximum size per grid dimension that we assume (compute capability >= 2.0) +#define MAX_GRID_SIZE 65535LL + +void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) { + int64_t dims = THCudaTensor__nDimension(state, tensor); + THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING); +} + +bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid) { + if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { + return false; + } + + int64_t gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + int64_t gridY = 1; + int64_t gridZ = 1; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); + gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); + gridZ = gridTiles > MAX_GRID_SIZE ? 
MAX_GRID_SIZE : gridTiles; + } + } + + grid = dim3(gridX, gridY, gridZ); + return true; +} diff --git a/aten/src/THC/THCReduceApplyUtils.cuh b/aten/src/THC/THCReduceApplyUtils.cuh new file mode 100644 index 0000000..bf979c5 --- /dev/null +++ b/aten/src/THC/THCReduceApplyUtils.cuh @@ -0,0 +1,152 @@ +#ifndef THC_REDUCE_APPLY_UTILS_INC +#define THC_REDUCE_APPLY_UTILS_INC + +#include +#include +#include "THCGeneral.h" +#include "THCTensor.h" +#include "THCDeviceUtils.cuh" +#include "THCTensorInfo.cuh" + +// Enum that indicates whether tensor arguments are read/write or +// read-only +enum TensorArgType { ReadWrite, ReadOnly }; + +template +__device__ __forceinline__ IndexType getLinearBlockId() { + return blockIdx.z * gridDim.y * gridDim.x + + blockIdx.y * gridDim.x + + blockIdx.x; +} + +// Reduce N values concurrently, i.e. suppose N = 2, and there are 4 threads: +// (1, 2), (3, 4), (5, 6), (7, 8), then the return in threadVals for thread 0 +// is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20) +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ void reduceNValuesInBlock(T *smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + if (numVals == 0) { + #pragma unroll + for (int i = 0; i < N; ++i) { + threadVals[i] = init; + } + return; + } + + // We store each of the N values contiguously, so if N = 2, all values for + // the first threadVal for each thread in the block are stored followed by + // all of the values for the second threadVal for each thread in the block + if (threadIdx.x < numVals) { + #pragma unroll + for (int i = 0; i < N; ++i) { + smem[i * numVals + threadIdx.x] = threadVals[i]; + } + } + __syncthreads(); + + // Number of lanes in the final reduction --> this is used to determine + // where to put the outputs of each of the n things we are reducing. If + // nLP = 32, then we have the 32 outputs for the first threadVal, + // followed by the 32 outputs for the second threadVal, etc. + const unsigned int numLanesParticipating = min(numVals, warpSize); + + if (numVals > warpSize && ((threadIdx.x / warpSize) == 0 )) { + #pragma unroll + for (int i = 0; i < N; ++i) { + threadVals[i] = threadIdx.x < numVals ? threadVals[i] : init; + } + + for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) { + #pragma unroll + for (int j = 0; j < N; ++j) { + threadVals[j] = reduceOp(threadVals[j], smem[j * numVals + i]); + } + } + + #pragma unroll + for (int i = 0; i < N; ++i) { + smem[i * numLanesParticipating + threadIdx.x] = threadVals[i]; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + if (numLanesParticipating == 32) { + #pragma unroll + for (int i = 0; i < N; ++i) { + #pragma unroll + for (int j = 1; j < 32; ++j) { + threadVals[i] = reduceOp(threadVals[i], smem[i * 32 + j]); + } + } + } else { + #pragma unroll + for (int i = 0; i < N; ++i) { + for (int j = 1; j < numLanesParticipating; ++j) { + threadVals[i] = reduceOp(threadVals[i], smem[i * numVals + j]); + } + } + } + } +} + +// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will +// return the reduced value +// +// If smem is not used again, there is no need to __syncthreads before this +// call. 
However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlock(T* smem, + const unsigned int numVals, + T threadVal, + ReduceOp reduceOp, + T init) { + reduceNValuesInBlock(smem, &threadVal, numVals, reduceOp, init); + return threadVal; +} + + +// Block-wide reduction where each thread locally reduces N +// values before letting a single warp take over - assumes +// threadVals is in registers, not shared memory +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlockWithNThreadLocalReductions(T *smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + int offset = threadIdx.x * N; + T local = offset < numVals ? threadVals[0] : init; + + #pragma unroll + for (int i = 1; i < N; ++i) { + ++offset; + T next = offset < numVals ? threadVals[i] : init; + local = reduceOp(local, next); + } + + return reduceBlock(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init); +} + +// Make sure the given tensor doesn't have too many dimensions +void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg); + +// Produces a grid with at least one point per tile +THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid); + +#endif // THC_REDUCE_APPLY_UTILS_INC diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh new file mode 100644 index 0000000..ef7c297 --- /dev/null +++ b/aten/src/THC/THCScanUtils.cuh @@ -0,0 +1,211 @@ +#ifndef THC_SCAN_UTILS_INC +#define THC_SCAN_UTILS_INC + +#include "THCAsmUtils.cuh" +#include "THCDeviceUtils.cuh" + +// Collection of in-kernel scan / prefix sum utilities + +// Inclusive Scan via an upsweep/downsweep mechanism. Assumes: +// +// 1. Power2ScanSize is a power of 2. This code still works for collections that +// do not exactly contain a power of 2 number of elements, simply round up to the +// nearest power of 2 and then call. +// +// 2. That there are two-elements per thread, i.e. the size of the smem storage +// is 2 * blockDim.x * sizeof(T). +// +// Consider a (+)-Scan on the following elements: +// +// Upsweep: +// +// 0 1 2 3 4 5 6 7 +// 1 5 9 13 +// 6 22 +// 28 +// +// Downsweep: +// 15 +// 3 10 21 +template +__device__ void inclusivePrefixScan(T *smem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = binop(smem[index], smem[index - stride]); + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = binop(smem[index + stride], smem[index]); + } + __syncthreads(); + } +} + +// Generic Op that can be used to support segmented scans by re-using +// the basic inclusiveScanOp. 
Merely requires that the input data has both +// a flag and val component +template +struct SegmentedScanOp { + __host__ __device__ SegmentedScanOp(BinaryOp binop): _binop(binop) {} + __host__ __device__ inline T operator()(const T& a, const T& b) { + T c; + c.val = a.flag ? a.val : _binop(a.val, b.val); + c.flag = a.flag | b.flag; + return c; + } + + BinaryOp _binop; +}; + +// Extends the above Inclusive Scan to support segments. It has the same properties +// but also takes a flag array that indicates the starts of "segments", i.e. individual +// units to scan. For example, consider the following (+)-scan that is segmented: +// +// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4] +// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0] +// Output: 1 4 6 4 5 2 3 5 1 5 +// +// So we see that each "flag" resets the scan to that index. +template +__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]); + bmem[index] = bmem[index] | bmem[index - stride]; + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]); + bmem[index + stride] = bmem[index + stride] | bmem[index]; + } + __syncthreads(); + } +} + +// Inclusive prefix sum using shared memory +template +__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) { + // FIXME: this is a slow, simple implementation; need up/down sweep, + // prevent smem conflicts + smem[threadIdx.x] = in; + + __syncthreads(); + + for (int offset = 1; offset < blockDim.x; offset *= 2) { + T val = 0; + + if (threadIdx.x >= offset) { + val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]); + } + + __syncthreads(); + if (threadIdx.x >= offset) { + smem[threadIdx.x] = val; + } + + __syncthreads(); + } + + *out = smem[threadIdx.x]; + + // Prevent write-after-read dependencies on smem usage above if necessary + if (KillWARDependency) { + __syncthreads(); + } +} + +// Exclusive prefix sum using shared memory +template +__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) { + // FIXME: crappy implementation + // We kill write-after-read dependencies separately below, hence the `false` + inclusivePrefixScan(smem, in, out, binop); + + *out -= in; + *carry = smem[blockDim.x - 1]; + + // Prevent write-after-read dependencies on smem usage above if necessary + if (KillWARDependency) { + __syncthreads(); + } +} + +// Inclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) { + // Within-warp, we use warp voting. + T vote = WARP_BALLOT(in); + T index = __popc(getLaneMaskLe() & vote); + T carry = __popc(vote); + + int warp = threadIdx.x / 32; + + // Per each warp, write out a value + if (getLaneId() == 0) { + smem[warp] = carry; + } + + __syncthreads(); + + // Sum across warps in one thread. 
This appears to be faster than a + // warp shuffle scan for CC 3.0+ + if (threadIdx.x == 0) { + int current = 0; + for (int i = 0; i < blockDim.x / 32; ++i) { + T v = smem[i]; + smem[i] = binop(smem[i], current); + current = binop(current, v); + } + } + + __syncthreads(); + + // load the carry from the preceding warp + if (warp >= 1) { + index = binop(index, smem[warp - 1]); + } + + *out = index; + + if (KillWARDependency) { + __syncthreads(); + } +} + +// Exclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) { + inclusiveBinaryPrefixScan(smem, in, out, binop); + + // Inclusive to exclusive + *out -= (T) in; + + // The outgoing carry for all threads is the last warp's sum + *carry = smem[(blockDim.x / 32) - 1]; + + if (KillWARDependency) { + __syncthreads(); + } +} + +#endif // THC_SCAN_UTILS_INC diff --git a/aten/src/THC/THCSleep.cu b/aten/src/THC/THCSleep.cu new file mode 100644 index 0000000..d305762 --- /dev/null +++ b/aten/src/THC/THCSleep.cu @@ -0,0 +1,21 @@ +#include "THCSleep.h" + + +__global__ void spin_kernel(int64_t cycles) +{ + // see concurrentKernels CUDA sampl + int64_t start_clock = clock64(); + int64_t clock_offset = 0; + while (clock_offset < cycles) + { + clock_offset = clock64() - start_clock; + } +} + +THC_API void THC_sleep(THCState* state, int64_t cycles) +{ + dim3 grid(1); + dim3 block(1); + spin_kernel<<>>(cycles); + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCSleep.h b/aten/src/THC/THCSleep.h new file mode 100644 index 0000000..ebd7e40 --- /dev/null +++ b/aten/src/THC/THCSleep.h @@ -0,0 +1,10 @@ +#ifndef THC_SPIN_INC +#define THC_SPIN_INC + +#include "THCGeneral.h" +#include + +// enqueues a kernel that spins for the specified number of cycles +THC_API void THC_sleep(THCState* state, int64_t cycles); + +#endif diff --git a/aten/src/THC/THCSortUtils.cu b/aten/src/THC/THCSortUtils.cu new file mode 100644 index 0000000..2561034 --- /dev/null +++ b/aten/src/THC/THCSortUtils.cu @@ -0,0 +1,17 @@ +#include "THCSortUtils.cuh" + +// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks +uint64_t nextHighestPowerOf2(uint64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; +#ifndef _MSC_VER + n |= n >> 32; +#endif + n++; + + return n; +} diff --git a/aten/src/THC/THCSortUtils.cuh b/aten/src/THC/THCSortUtils.cuh new file mode 100644 index 0000000..518063a --- /dev/null +++ b/aten/src/THC/THCSortUtils.cuh @@ -0,0 +1,216 @@ +#ifndef THC_SORT_UTILS_INC +#define THC_SORT_UTILS_INC + +#include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCNumerics.cuh" + +// Collection of kernel sort routines +template +struct LTComp { + __device__ inline bool operator()(const T& a, const T& b) const { + return THCNumerics::lt(a, b); + } +}; + +template +struct GTComp { + __device__ inline bool operator()(const T& a, const T& b) const { + return THCNumerics::gt(a, b); + } +}; + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA, + K& kB, V& vB, bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSwapKeys(K& kA, bool& 
validA, + K& kB, bool& validB, + bool dir, + const Comparator& comp) { + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(validA, validB); + } +} + +template +__device__ inline void bitonicSort(K keys[Power2SortSize], + V values[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#pragma unroll + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#pragma unroll + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#pragma unroll + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +template +__device__ inline void bitonicSortKeys(K keys[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#pragma unroll + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#pragma unroll + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], valid[pos], + keys[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#pragma unroll + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], valid[pos], + keys[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +// Sorts (key, value) pairs (in different tensors) in-place; i.e., +// modifies the input `keys` and `values` +template +__launch_bounds__(1024) +__global__ void +bitonicSortKVInPlace(TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + TensorInfo values, + IndexType valueSliceStride, + const Comparator& comp) { + // Find the slice of the tensor that we are sorting + const IndexType linearIndex = getLinearBlockId(); + // Tiling the slices could have us be out of bounds, if there are a + // lot of slices to sort + if (linearIndex >= keySlices) { + return; + } + + __shared__ K sharedKeys[Power2SortSize]; + __shared__ V sharedValues[Power2SortSize]; + __shared__ bool sharedValid[Power2SortSize]; + + const IndexType keyStartOffset = + IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + IndexToOffset::get(linearIndex, values); + + // If the sort size is 1, the data is already sorted + if (Power2SortSize == 1) { + return; + } else { + // Otherwise, each thread is responsible for loading and storing 2 + // elements. The sort size is guaranteed to be >= 2 + const int elem1 = threadIdx.x; + const int elem2 = threadIdx.x + (Power2SortSize / 2); + + bool valid1 = (elem1 < keySliceSize); + K k1 = valid1 ? + keys.data[keyStartOffset + elem1 * keySliceStride] : ScalarConvert::to(0); + V v1 = valid1 ? 
+ values.data[valueStartOffset + elem1 * valueSliceStride] : ScalarConvert::to(0); + + sharedKeys[elem1] = k1; + sharedValues[elem1] = v1; + sharedValid[elem1] = valid1; + + bool valid2 = (elem2 < keySliceSize); + K k2 = valid2 ? + keys.data[keyStartOffset + elem2 * keySliceStride] : ScalarConvert::to(0); + V v2 = valid2 ? + values.data[valueStartOffset + elem2 * valueSliceStride] : ScalarConvert::to(0); + + sharedKeys[elem2] = k2; + sharedValues[elem2] = v2; + sharedValid[elem2] = valid2; + + // Sort! + bitonicSort( + sharedKeys, sharedValues, sharedValid, comp); + + // elem1 and elem2 values might be out-of-range, if the data size we are + // sorting is smaller than half the power2 size + if (valid1) { + keys.data[keyStartOffset + elem1 * keySliceStride] = + sharedKeys[elem1]; + values.data[valueStartOffset + elem1 * valueSliceStride] = + sharedValues[elem1]; + } + + if (valid2) { + keys.data[keyStartOffset + elem2 * keySliceStride] = + sharedKeys[elem2]; + values.data[valueStartOffset + elem2 * valueSliceStride] = + sharedValues[elem2]; + } + } +} + +uint64_t nextHighestPowerOf2(uint64_t n); + +#endif // THC_SORT_UTILS_INC diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp new file mode 100644 index 0000000..c4ff07c --- /dev/null +++ b/aten/src/THC/THCStorage.cpp @@ -0,0 +1,111 @@ +#include "THCStorage.hpp" +#include "THCGeneral.h" + +#include "THCHalf.h" + +#include + +#include "generic/THCStorage.cpp" +#include "THCGenerateAllTypes.h" + +THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) +{ + return THCStorage_newWithSize(state, scalar_type, 0); +} + +THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) +{ + return THCStorage_newWithAllocator( + state, scalar_type, size, + state->cudaDeviceAllocator); +} + +THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator) +{ + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->scalar_type = scalar_type; + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + storage->size = size; + + at::DataPtr ptr; + try { + ptr = allocator->allocate(size * at::elementSize(scalar_type)); + } catch(...) 
{ + free(storage); + throw; + } + new (&storage->data_ptr) at::DataPtr(std::move(ptr)); + return storage; +} + +void THCStorage_free(THCState *state, THCStorage *storage) +{ + THStorage_free(storage); +} + +void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THArgCheck(size >= 0, 2, "invalid size"); + THAssert(self->allocator != nullptr); + int device; + THCudaCheck(cudaGetDevice(&device)); + + if(!(self->flag & TH_STORAGE_RESIZABLE)) + THError("Trying to resize storage that is not resizable"); + + size_t elementSize = at::elementSize(self->scalar_type); + + if(size == 0) + { + self->data_ptr = at::DataPtr(nullptr, at::Device(at::kCUDA, device)); + self->size = 0; + } + else + { + at::DataPtr data = + self->allocator->allocate(size * elementSize); + + if (self->data_ptr) { + // Enable p2p access when the memcpy is across devices + THCState_getPeerToPeerAccess(state, device, THCStorage_getDevice(state, self)); + + THCudaCheck(cudaMemcpyAsync(data.get(), + self->data_ptr.get(), + THMin(self->size, size) * elementSize, + cudaMemcpyDeviceToDevice, + THCState_getCurrentStream(state))); + } + + // Destructively overwrite data_ptr + self->data_ptr = std::move(data); + self->size = size; + } +} + +int THCStorage_getDevice(THCState* state, const THCStorage* storage) { + return storage->data_ptr.device().index(); +} + +THCStorage* THCStorage_newWithDataAndAllocator( + THCState *state, at::ScalarType scalar_type, at::DataPtr&& data, ptrdiff_t size, + at::Allocator *allocator) { + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + storage->scalar_type = scalar_type; + new (&storage->data_ptr) at::DataPtr(std::move(data)); + storage->size = size; + new (&storage->refcount) std::atomic(1); + new (&storage->weakcount) std::atomic(1); + new (&storage->finalizer) std::unique_ptr(nullptr); + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; + storage->allocator = allocator; + return storage; +} diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu new file mode 100644 index 0000000..43a2934 --- /dev/null +++ b/aten/src/THC/THCStorage.cu @@ -0,0 +1,13 @@ +#include "THCStorage.hpp" + +#include "THCThrustAllocator.cuh" +#include +#include +#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__) +#include +#endif + +#include "THCHalf.h" + +#include "generic/THCStorage.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h new file mode 100644 index 0000000..22a607c --- /dev/null +++ b/aten/src/THC/THCStorage.h @@ -0,0 +1,12 @@ +#ifndef THC_STORAGE_INC +#define THC_STORAGE_INC + +#include "THStorage.h" +#include "THCGeneral.h" + +#define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) + +#include "generic/THCStorage.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp new file mode 100644 index 0000000..ae5ad7b --- /dev/null +++ b/aten/src/THC/THCStorage.hpp @@ -0,0 +1,39 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? 
Please +// read Note [TH abstraction violation] + +#include "THCStorage.h" +#include + +#include "ATen/ScalarType.h" +#include "ATen/ScalarTypeUtils.h" +#include + +namespace at { + +template <> +struct CTypeToScalarType<__half> : public CTypeToScalarType {}; + +} + +THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); +THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); + +THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator); + +THC_API void THCStorage_retain(THCState *state, THCStorage *storage); + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +THC_API void THCStorage_free(THCState *state, THCStorage *self); + +THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); + +THC_API THCStorage* THCStorage_newWithDataAndAllocator( + THCState *state, at::ScalarType scalar_type, + at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); diff --git a/aten/src/THC/THCStorageCopy.cpp b/aten/src/THC/THCStorageCopy.cpp new file mode 100644 index 0000000..9e42df5 --- /dev/null +++ b/aten/src/THC/THCStorageCopy.cpp @@ -0,0 +1,7 @@ +#include "THCStorageCopy.h" +#include "THCTensor.hpp" + +#include "THCTensorCopy.h" + +#include "generic/THCStorageCopy.cpp" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu new file mode 100644 index 0000000..8d7c869 --- /dev/null +++ b/aten/src/THC/THCStorageCopy.cu @@ -0,0 +1,10 @@ +#include "THCStorageCopy.h" +#include "THCGeneral.h" + +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCStorageCopy.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h new file mode 100644 index 0000000..837056f --- /dev/null +++ b/aten/src/THC/THCStorageCopy.h @@ -0,0 +1,11 @@ +#ifndef THC_STORAGE_COPY_INC +#define THC_STORAGE_COPY_INC + +#include "THCStorage.h" +#include "THCGeneral.h" +#include "THCHalf.h" + +#include "generic/THCStorageCopy.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCStream.cpp b/aten/src/THC/THCStream.cpp new file mode 100644 index 0000000..01fc9b0 --- /dev/null +++ b/aten/src/THC/THCStream.cpp @@ -0,0 +1,32 @@ +#include "THCStream.h" +#include "ATen/CUDAStream.h" + +THC_API THCStream* THCStream_defaultStream(int device) { + return at::detail::CUDAStream_getDefaultStreamOnDevice(device); +} + +THC_API THCStream* THCStream_new(int flags) { + return THCStream_newWithPriority(flags, at::CUDAStream::DEFAULT_PRIORITY); +} + +THC_API THCStream* THCStream_newWithPriority(int flags, int priority) { + return at::detail::CUDAStream_createAndRetainWithOptions(flags, priority); +} + +THC_API cudaStream_t THCStream_stream(THCStream* stream) { + return at::detail::CUDAStream_stream(stream); +} + +THC_API int THCStream_device(THCStream* stream) { + return at::detail::CUDAStream_device(stream); +} + +THC_API void THCStream_retain(THCStream* stream) { + at::detail::CUDAStream_retain(stream); +} + +THC_API void THCStream_free(THCStream* stream) { + at::detail::CUDAStream_free(stream); +} + + diff --git a/aten/src/THC/THCStream.h b/aten/src/THC/THCStream.h new file mode 100644 index 0000000..87e5037 --- /dev/null +++ 
b/aten/src/THC/THCStream.h @@ -0,0 +1,26 @@ +#ifndef THC_STREAM_INC +#define THC_STREAM_INC + +#include "THCGeneral.h" + +/* +* Note: legacy API. +* +* Stream usage should be done through ATen/Context.h. +*/ +typedef struct CUDAStreamInternals THCStream; + +// Stream creation +THC_API THCStream* THCStream_defaultStream(int device); +THC_API THCStream* THCStream_new(int flags); +THC_API THCStream* THCStream_newWithPriority(int flags, int priority); + +// Getters +THC_API cudaStream_t THCStream_stream(THCStream*); +THC_API int THCStream_device(THCStream*); + +// Memory management +THC_API void THCStream_retain(THCStream*); +THC_API void THCStream_free(THCStream*); + +#endif // THC_STREAM_INC diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp new file mode 100644 index 0000000..55e46bf --- /dev/null +++ b/aten/src/THC/THCTensor.cpp @@ -0,0 +1,443 @@ +#include "THCGeneral.h" +#include "THCTensor.hpp" +#include "THCTensorCopy.h" + +#include + +#include "generic/THCTensor.cpp" +#include "THCGenerateAllTypes.h" + +#include "THCTensorInfo.cuh" + +int THCTensor_nDimension(THCState *state, const THCTensor *self) { + return self->dim(); +} + +int THCTensor__nDimension(THCState *state, const THCTensor *self) { + return self->_dim(); +} + +int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); + return self->size[dim]; +} + +int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); + return self->stride[dim]; +} +THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self) { + THLongStorage *size = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(size, self->size); + return size; +} + +THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { + switch(scalar_type) { + case at::ScalarType::Byte: + return THCudaByteTensor_new(state); + case at::ScalarType::Char: + return THCudaCharTensor_new(state); + case at::ScalarType::Short: + return THCudaShortTensor_new(state); + case at::ScalarType::Int: + return THCudaIntTensor_new(state); + case at::ScalarType::Long: + return THCudaLongTensor_new(state); +#ifdef CUDA_HALF_TENSOR + case at::ScalarType::Half: + return THCudaHalfTensor_new(state); +#endif + case at::ScalarType::Float: + return THCudaTensor_new(state); + case at::ScalarType::Double: + return THCudaDoubleTensor_new(state); + default: + AT_ERROR("unexpected ScalarType: ", at::toString(scalar_type)); + } +} + +void THCTensor_resize(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) { + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + + THCTensor_resizeNd(state, self, size->size, THLongStorage_data(size), (stride ? 
THLongStorage_data(stride) : NULL)); +} + +void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { + int isSame = 0; + int d; + if(self->dim() == src->dim()) + { + isSame = 1; + for(d = 0; d < self->dim(); d++) + { + if(self->size[d] != src->size[d]) + { + isSame = 0; + break; + } + } + } + + if(!isSame) + THCTensor_resizeNd(state, self, src->dim(), src->size, NULL); +} + +void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + ptrdiff_t totalSize; + bool hascorrectsize = true; + +#ifndef USE_TH_SCALAR + AT_CHECK(nDimension > 0, "resizeNd nDimension must be greater than 0"); +#else + AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); +#endif + + for(d = 0; d < nDimension; d++) + { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. + if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif + if((self->dim() > d) && (size[d] != self->size[d])) { + hascorrectsize = false; + } + + // NB: this used to test that stride[d] was >= 0 + if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + hascorrectsize = false; + } + } + + if(nDimension != self->dim()) { + hascorrectsize = false; + } + + if(hascorrectsize) { + return; + } + + if(nDimension != self->dim()) + { + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->dim_ = nDimension; + } + + totalSize = 1; + for(d = nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) { + self->stride[d] = stride[d]; + } else { + if(d == nDimension-1) { + self->stride[d] = 1; + } else { + // Keep stride monotonically increasing to match NumPy. 
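+          // For example, resizing to sizes [2, 3, 4] yields row-major
+          // strides [12, 4, 1]; the max(size, 1) guard keeps a size-0
+          // dimension from collapsing the strides of the outer
+          // dimensions to zero.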
+ self->stride[d] = std::max(self->size[d+1],1)*self->stride[d+1]; + } + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) { + THError("Tensor: invalid null storage"); + } + if(totalSize+self->storageOffset > self->storage->size) { + THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + } + } +} + +void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) +{ + if(self != src) + THCTensor_setStorageNd(state, + self, + src->storage, + src->storageOffset, + src->dim(), + src->size, + src->stride); +} + +void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if (!self->storage) { + THError("Tensor: invalid null storage"); + } + auto scalar_type = self->storage->scalar_type; + THCStorage_free(state, self->storage); + + if(storage) + { + self->storage = storage; + THStorage_retain(self->storage); + } + else + self->storage = THCStorage_new(state, scalar_type); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + +void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck(dimension < src->dim(), 3, "dimension out of range"); + + THCTensor_set(state, self, src); + +#ifdef TH_SCALAR + if(src->size[dimension] == 1) +#else + if(src->size[dimension] == 1 && src->dim() > 1) +#endif + { + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; + } +} + +void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); +#endif + + THCTensor_set(state, self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); + self->dim_++; + for (d = self->dim()-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->dim()) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { + if (self->is_empty()) return true; + int64_t z = 1; + int d; + for(d = self->dim()-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return false; + } + } + return true; +} + +bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs) { + THAssert(numInputs > 0); + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_isContiguous(state, inputs[i])) { + return false; + } + } + return true; +} + +ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { + if(self->_dim() == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->_dim(); d++) + nElement *= self->size[d]; + return nElement; + } +} + +void 
THCTensor_retain(THCState *state, THCTensor *self) { + self->refcount++; +} + + +void THCTensor_free(THCState *state, THCTensor *self) { + THTensor_free(self); +} + +int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { + if (!tensor->storage) return -1; + return THCStorage_getDevice(state, tensor->storage); +} + +bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs) { + THAssert(numInputs > 0); + int device = THCTensor_getDevice(state, inputs[0]); + for (int i = 1; i < numInputs; ++i) { + if (THCTensor_getDevice(state, inputs[i]) != device) { + return false; + } + } + return true; +} + +bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem) { + ptrdiff_t elements = THCTensor_nElement(state, t); + if (elements >= max_elem) { + return false; + } + + ptrdiff_t offset = 0; + ptrdiff_t linearId = elements - 1; + + for (int i = THCTensor__nDimension(state, t) - 1; i >= 0; --i) { + ptrdiff_t curDimIndex = + linearId % THCTensor_size(state, t, i); + ptrdiff_t curDimOffset = curDimIndex * + THCTensor_stride(state, t, i); + offset += curDimOffset; + linearId /= THCTensor_size(state, t, i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs) { + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_canUse32BitIndexMath(state, inputs[i])) { + return false; + } + } + return true; +} + +/* Due to the resize semantics of ops with `out=` keywords, if */ \ +/* the output `tensor` has the same shape as the output of the */ \ +/* reduction operation, then any noncontiguities in the output */ \ +/* `tensor` should be preserved. This needs to be special cased b/c */ \ +/* otherwise, when keepdim=False, the implementations of reduction */ \ +/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ +/* then later squeeze `tensor` to the correct output size, breaking */ \ +/* the contiguity guarantees of the resize semantics. */ \ +void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, + int in_dims, int64_t dimension, int keepdim) { + int out_dims = THCTensor__nDimension(state, tensor); + if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { + THCTensor_unsqueeze1d(state, tensor, tensor, dimension); + } +} + +namespace { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +} + +/* Returns false if there is no possibility that the tensor */ +/* has "overlapping" indices and true otherwise. */ +/* "Overlapping" indices are two+ valid indices that specify */ +/* the same offset within the tensor. */ +/* The function does this by checking for a sufficient but not */ +/* necessary condition of no overlap. In particular, that */ +/* that there exists an ordering of the tensor's dimensions */ +/* that is nicely "nested," with each dimension contained */ +/* within the next one. */ +bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t) { + /* Extract size/stride arrays; only consider size >1 dims. 
*/ + SizeAndStride info[MAX_CUTORCH_DIMS]; + + int dims = THCTensor__nDimension(state, t); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = THCTensor_size(state, t, i); + + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = + THCTensor_stride(state, t, i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + /* Short-circuits if tensor is a single element. */ + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} diff --git a/aten/src/THC/THCTensor.cu b/aten/src/THC/THCTensor.cu new file mode 100644 index 0000000..34de80f --- /dev/null +++ b/aten/src/THC/THCTensor.cu @@ -0,0 +1,5 @@ +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCTensor.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensor.h b/aten/src/THC/THCTensor.h new file mode 100644 index 0000000..368ec99 --- /dev/null +++ b/aten/src/THC/THCTensor.h @@ -0,0 +1,20 @@ +#ifndef THC_TENSOR_INC +#define THC_TENSOR_INC + +#include "THTensor.h" +#include "THCStorage.h" +#include "THCGeneral.h" + +#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME) + +#define THC_DESC_BUFF_LEN 64 + +typedef struct THC_CLASS THCDescBuff +{ + char str[THC_DESC_BUFF_LEN]; +} THCDescBuff; + +#include "generic/THCTensor.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp new file mode 100644 index 0000000..56147b2 --- /dev/null +++ b/aten/src/THC/THCTensor.hpp @@ -0,0 +1,52 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include "THCTensor.h" +#include "THTensor.hpp" +#include "THCStorage.hpp" + +#include +#include + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). 
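+// (The underscore variants are what the legacy THC code paths rely on;
+// for example, THC_reduceDim and THC_reduceAll treat _nDimension == 0
+// as "zero-dim tensor; do nothing", while nDimension simply forwards
+// to the newer dim() accessor.)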
+THC_API int THCTensor_nDimension(THCState *state, const THCTensor *self); +THC_API int THCTensor__nDimension(THCState *state, const THCTensor *self); + +THC_API int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self); + +THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); + +THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_resizeAs(THCState *state, THCTensor *tensor, THCTensor *src); + +THC_API void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_); +THC_API void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_); + +THC_API bool THCTensor_isContiguous(THCState *state, const THCTensor *self); +THC_API bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs); +THC_API ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self); + +THC_API void THCTensor_retain(THCState *state, THCTensor *self); +THC_API void THCTensor_free(THCState *state, THCTensor *self); + +THC_API int THCTensor_getDevice(THCState* state, const THCTensor* tensor); +THC_API bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs); + +/* Can we use 32 bit math for indexing? */ +THC_API bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem=INT32_MAX); +/* Are all tensors 32-bit indexable? */ +THC_API bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs); +THC_API void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, int in_dims, + int64_t dimension, int keepdim); +/* Returns false if there is no possibility that the tensor */ +/* has more than one index that references the same datapoint, */ +/* true otherwise. 
*/ +THC_API bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t); diff --git a/aten/src/THC/THCTensorCopy.cpp b/aten/src/THC/THCTensorCopy.cpp new file mode 100644 index 0000000..09e043c --- /dev/null +++ b/aten/src/THC/THCTensorCopy.cpp @@ -0,0 +1,7 @@ +#include "THCTensorCopy.h" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "THCCachingHostAllocator.h" + +#include "generic/THCTensorCopy.cpp" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu new file mode 100644 index 0000000..7f42f72 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.cu @@ -0,0 +1,210 @@ +#include "THCApply.cuh" +#include "THCHalf.h" +#include "THCNumerics.cuh" +#include "THCTensorCopy.hpp" +#include + +inline int curGPU() { + int curDev; + THCudaCheck(cudaGetDevice(&curDev)); + return curDev; +} + +// Copy operator for the pointwise apply kernel +template +struct CopyOp { + __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) { +#if __CUDA_ARCH__ >= 350 + *dst = ScalarConvert::to(__ldg(src)); +#else + *dst = ScalarConvert::to(*src); +#endif + } +}; + +// Copy for the same type to the same type +template +void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { + + ptrdiff_t totalElements = THCTensor_nElement(state, dst); + + THArgCheck(totalElements == + THCTensor_nElement(state, src), + 2, "sizes do not match"); + + if (THCTensor__nDimension(state, dst) == 0) { + // Zero-dim tensor; copy nothing + return; + } + + // We can memcpy the memory if: + // -both tensors are contiguous; or, + // -there is only one element to copy; or, + // -FIXME: if both tensors have matching size and stride arrays, and no + // holes within (in other words, there is some permutation that can be applied + // to the size/strides such that the resulting tensor is + // contiguous). + // -AND: both tensors have the same type. + bool sameType = std::is_same::value; + bool srcContig = THCTensor_isContiguous(state, src); + bool dstContig = THCTensor_isContiguous(state, dst); + bool memcpyEligible = + ((srcContig && dstContig) || (totalElements == 1)) && sameType; + + int srcDev = THCTensor_getDevice(state, src); + int dstDev = THCTensor_getDevice(state, dst); + int oldDev = curGPU(); + + // Try to enable p2p access. This also handles the case srcDev == dstDev. + bool p2pEnabled = THCState_getPeerToPeerAccess(state, srcDev, dstDev); + + // We always perform the copy on the source device, using the + // current stream on the source device. + // If the copy is on the default stream, then we fully synchronize + // both src and dst's default streams for completion of the + // copy. We have to explicitly do this for non-contig copies. + // This mimics the behavior of cross-device cudaMemcpyAsync on + // the default stream. + // If the copy is not on the default stream, then it is up to the + // user to add needed synchronization on the dst device, since the + // stream on the dst device that wishes to synchronize may not be + // the same index as the one on the src device. + cudaStream_t copyStream = THCState_getCurrentStreamOnDevice(state, srcDev); + if (srcDev != dstDev && copyStream == NULL) { + // This is a cross-device copy on the default stream. We perform a + // two-way barrier between both devices' default streams before + // the copy. 
This ensures that any write-after-write and + // write-after-read dependencies on the destination side are + // handled, so that no one is operating on the dst memory when + // we perform the copy. + // src waits on dst barrier (src already waits on src) + cudaEvent_t dstReady; + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(dstReady, NULL)); + + THCudaCheck(cudaSetDevice(srcDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); + THCudaCheck(cudaEventDestroy(dstReady)); + } else if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(srcDev)); + } + + // We are now on srcDev + if (memcpyEligible) { + // Perform the copy + THCudaCheck(cudaMemcpyAsync( + dst->template data(), + src->template data(), + totalElements * + sizeof(ScalarTypeDst), + cudaMemcpyDeviceToDevice, + copyStream)); + } else { + // Non-contiguous copy or a type-conversion copy + + // We avoid creating temporary memory copies if possible. + // If both src and dst are on the same device, or if they are on + // different devices and p2p access is enabled, perform the copy + // by a pointwise copy kernel. + // Otherwise, we'll have to make contiguous (which will in fact + // invoke copy() again), and then perform the copy. + // FIXME: might want to consider only running the pointwise kernel + // if both src and dst innermost dimensions are contiguous. If + // they are not, then taking the hit of the memory allocation/free + // might be worth it to avoid non-coalesced reads or writes. + if (p2pEnabled) { + bool succ = + THC_pointwiseApply2( + state, dst, src, + CopyOp()); + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } else { + // GPUs can't access each other directly, but the tensors + // involved are non-contiguous and/or are different types. + + // Make sure the src is contiguous and in the same type as dst + THCudaCheck(cudaSetDevice(srcDev)); + THCTensor* srcContig = NULL; + + if (sameType) { + srcContig = THCTensor_newContiguous(state, src); + + } else { + // Types are different + // Copy into the new format, contiguous, on the source device + srcContig = THCTensor_new(state, + at::CTypeToScalarType::to()); + THCTensor_resizeAs(state, srcContig, dst); + + bool succ = + THC_pointwiseApply2( + state, srcContig, src, + CopyOp()); + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } + + // Make sure the dst is contiguous + THCudaCheck(cudaSetDevice(dstDev)); + THCTensor* dstContig = THCTensor_newContiguous(state, dst); + + // Now, we are ready for a cross-device memcpy of contiguous + // data, of the same layout and type + THCudaCheck(cudaSetDevice(srcDev)); + + THCudaCheck(cudaMemcpyAsync( + dstContig->template data(), + srcContig->template data(), + totalElements * + sizeof(ScalarTypeDst), + cudaMemcpyDeviceToDevice, + copyStream)); + + // We are done with the src + THCTensor_free(state, srcContig); + + if (dst != dstContig) { + THCTensor_freeCopyTo(state, dstContig, dst); + } else { + THCTensor_free(state, dstContig); + } + + // We're still on srcDev at this point + } + } + + if (srcDev != dstDev && copyStream == NULL) { + // dst waits on src barrier (dst already waits on dst). We cannot + // operate on dst's copy until the copy is complete. 
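    // Mirror image of the pre-copy barrier above: there an event was recorded
    // on dst's NULL stream and src's NULL stream waited on it; here an event
    // is recorded on src's NULL stream after the copy is enqueued and dst's
    // NULL stream waits on it, so later work queued on dst cannot start until
    // the copy has finished.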
+ + // Still on srcDev, record default stream event + cudaEvent_t srcReady; + THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(srcReady, NULL)); + + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); + THCudaCheck(cudaEventDestroy(srcReady)); + + // We are now on dstDev (right above). Restore prior device from dst + if (dstDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } else { + // We are still on srcDev. Restore prior device from src + if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#include "generic/THCTensorCopy.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h new file mode 100644 index 0000000..74f2b59 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.h @@ -0,0 +1,12 @@ +#ifndef TH_CUDA_TENSOR_COPY_INC +#define TH_CUDA_TENSOR_COPY_INC + +#include "THCTensor.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCStream.h" + +#include "generic/THCTensorCopy.h" +#include "THCGenerateAllTypes.h" + +#endif diff --git a/aten/src/THC/THCTensorCopy.hpp b/aten/src/THC/THCTensorCopy.hpp new file mode 100644 index 0000000..8e3c762 --- /dev/null +++ b/aten/src/THC/THCTensorCopy.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "THCTensorCopy.h" + +template +void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src); + +template +THCTensor *THCTensor_newClone(THCState *state, THCTensor *self); + +template +THCTensor *THCTensor_newContiguous(THCState *state, THCTensor *self); + +template +void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor *dst); + +template +void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, THCTensor* src); diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu new file mode 100644 index 0000000..68bea1b --- /dev/null +++ b/aten/src/THC/THCTensorIndex.cu @@ -0,0 +1,482 @@ +#include "THC.h" +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCTensorRandom.h" +#include "THCHalf.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCDeviceUtils.cuh" +#include "THCNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include +#include +#include // for std::min + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexCopyLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexCopySmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + int64_t dstCopyDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
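  // A host-side caller would typically choose between the two variants based
  // on how many indices there are, e.g. (hypothetical threshold and launch
  // configuration):
  //
  //   if (numIndices <= 16) {
  //     indexCopySmallIndex<...><<<smallGrid, smallBlock, 0, stream>>>(...);
  //   } else {
  //     indexCopyLargeIndex<...><<<largeGrid, largeBlock, 0, stream>>>(...);
  //   }
  //
  // The small-index variant parallelizes only over the slice (innerSize) and
  // loops over the indices; the large-index variant below parallelizes over
  // indices and slice elements together (totalSize).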
+ for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstCopyDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + + dstOffset += dstIndex * dst.strides[dstCopyDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcCopyDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexCopySmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexCopyLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstCopyDim, + int srcCopyDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstCopyDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType srcIndex, elementInSlice; + if (IndexIsMajor) { + srcIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + srcIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstCopyDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstCopyDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcCopyDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexAddLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexAddSmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + int64_t dstAddDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
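  // Unlike the copy kernels, the add kernels accumulate with atomicAdd: the
  // same destination index may appear several times in `indices`, so plain
  // stores from different indices (and from the large-index variant's
  // concurrent threads) would race instead of summing.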
+ for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstAddDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexAddSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexAddLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstAddDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType srcIndex, elementInSlice; + if (IndexIsMajor) { + srcIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + srcIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[IndexToOffset::get(srcIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex < dstAddDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexFillLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexFillSmallIndex(TensorInfo dst, + TensorInfo indices, + int dstFillDim, + IndexType innerSize, + int64_t dstFillDimSize, + T val) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
+ for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { + // Lua indices begin at 1 + IndexType dstIndex_ = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex_ < dstFillDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex_ * dst.strides[dstFillDim]; + + dst.data[dstOffset] = val; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexFillSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexFillLargeIndex(TensorInfo dst, + TensorInfo indices, + int dstFillDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstFillDimSize, + T val) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstIndex, elementInSlice; + if (IndexIsMajor) { + dstIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + dstIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex_ = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(dstIndex_ < dstFillDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex_ * dst.strides[dstFillDim]; + + dst.data[dstOffset] = val; + } +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexSelectLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexSelectSmallIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType innerSize, + int64_t srcSelectDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. 
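  // Note the direction is reversed relative to the indexCopy/indexAdd kernels
  // above: here the index chooses which *source* slice to read, and the
  // output is written densely along dstSelectDim.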
+ for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { + // Lua indices begin at 1 + IndexType srcIndex = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(srcIndex < srcSelectDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexSelectSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexSelectLargeIndex(TensorInfo dst, + TensorInfo src, + TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType totalSize, + IndexType innerSize, + int64_t srcSelectDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstIndex, elementInSlice; + if (IndexIsMajor) { + dstIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + dstIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType srcIndex = + indices.data[IndexToOffset::get(dstIndex, indices)] - TH_INDEX_BASE; + assert(srcIndex < srcSelectDimSize); + + IndexType dstOffset = + IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } +} + +template +__device__ __forceinline__ IndexType indexToOffset( + const TensorInfo& info, + int64_t index, + IndexType size) +{ + IndexType linearIndex = static_cast(index); + assert(linearIndex < size && linearIndex >= -size); + if (linearIndex < 0) { + linearIndex += size; + } + return IndexToOffset::get(linearIndex, info) - TH_INDEX_BASE; +} + +struct WrapIndexOp { + WrapIndexOp(int64_t size) : size(size) {} + + __device__ __forceinline__ void operator()(int64_t* out, int64_t* in) { + auto idx = *in; + assert(idx < size && idx >= -size); + *out = idx < 0 ? 
idx + size : idx; + } + + int64_t size; +}; + +template +struct TensorTakeOp { + TensorTakeOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) + : info(info), numel(numel) {} + + __device__ __forceinline__ void operator()(T* out, int64_t* index) { + auto offset = indexToOffset(info, *index, numel); + *out = info.data[offset]; + } + + const TensorInfo info; + IndexType numel; +}; + +template +struct TensorPutOp { + TensorPutOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) + : info(info), numel(numel) {} + + __device__ __forceinline__ void operator()(T* value, int64_t* index) { + auto offset = indexToOffset(info, *index, numel); + info.data[offset] = *value; + } + + const TensorInfo info; + IndexType numel; +}; + +template +struct TensorPutAccumulateOp { + TensorPutAccumulateOp(TensorInfo info, IndexType numel, int64_t* start, int64_t* end) + : info(info), numel(numel), start(start), end(end) {} + + __device__ __forceinline__ void operator()(T* value, int64_t* index) { + if (index == start || *index != *(index - 1)) { + int64_t linear_index = *index; + auto offset = indexToOffset(info, linear_index, numel); + do { + info.data[offset] = THCNumerics::add(info.data[offset], *value); + index++; + value++; + } while (index != end && *index == linear_index); + } + } + + const TensorInfo info; + IndexType numel; + int64_t* start; + int64_t* end; +}; + + +template class Op, typename TensorType> +void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { + // These are only valid if index is contiguous + auto start = THCudaLongTensor_data(state, index); + auto end = start + THCudaLongTensor_numel(state, index); + + auto aInfo = getTensorInfo(state, a); + aInfo.collapseDims(); + auto numel = THCTensor_nElement(state, a); + if (aInfo.isContiguous()) { + auto op = Op(aInfo, numel, start, end); + THC_pointwiseApply2(state, b, index, op); + } else { + auto op = Op(aInfo, numel, start, end); + THC_pointwiseApply2(state, b, index, op); + } +} + +template class Op, typename TensorType> +void dispatchTakePut(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { + if (THCTensor_canUse32BitIndexMath(state, a, INT_MAX)) { + dispatchTakePutImpl(state, a, b, index); + } else { + dispatchTakePutImpl(state, a, b, index); + } +} + +#include "generic/THCTensorIndex.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorInfo.cuh b/aten/src/THC/THCTensorInfo.cuh new file mode 100644 index 0000000..a42fa65 --- /dev/null +++ b/aten/src/THC/THCTensorInfo.cuh @@ -0,0 +1,260 @@ +#ifndef THC_TENSOR_INFO_INC +#define THC_TENSOR_INFO_INC + +#include +#include +#include "THCGeneral.h" +#include "THCIntegerDivider.cuh" +#include "THCTensor.h" + +// Maximum number of dimensions allowed for cutorch +#define MAX_CUTORCH_DIMS 25 + +// Warning string for tensor arguments that are too large or have too +// many dimensions +#define CUTORCH_STR(X) #X +#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ + CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + /* + Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of + the info, possibly excluding the optional 
excludeDim. A "collapsed" version + of the info is the fewest dims that order the tensor's elements in the same + way as the original info. If excludeDim is specified, the collapse is the + fewest dims that order the tensor's elements as the original and preserve the + excluded dimension, unless the tensor collapses to a point. + + Returns the (new) index of the preserved dimension if excludeDim is + specified. Returns 0 if the tensor is collapsed to a point. Returns -1 + otherwise. + */ + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_CUTORCH_DIMS]; + IndexType strides[MAX_CUTORCH_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]) { + data = p; + dims = dim; + assert(dims > 0 && dims < MAX_CUTORCH_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + assert(dim < dims && dim >= 0); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + + assert(excludeDim >= -1 && excludeDim < dims); + + int stopDim = (excludeDim == -1) ? dims : excludeDim; + int newIndex = -1; + int oldIndex = 0; + int remappedExcludedDim = -1; + + while (oldIndex < dims) { + // Finds a dimension to collapse into + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + ++oldIndex; + break; + } + + // Collapses dims + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { + sizes[newIndex] *= sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } else { + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } + } + + // Handles excludeDim being set (oldIndex == excludeDim) + if (oldIndex != dims) { + + // Preserves excluded dimension + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + remappedExcludedDim = newIndex; + + // Restarts iteration after excludeDim + ++oldIndex; + stopDim = dims; + } + } + + // Handles special case of all dims size 1 + if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + return 0; + } + + dims = newIndex + 1; + return remappedExcludedDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses dynamic dims + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = 
curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// OffsetInfo is a faster implementation of IndexToOffset that uses faster +// integer division: we transform each division into integer multiplication by a +// pre-computed constant. (See IntDivider for details.) +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) { + assert(tinfo.dims == Dims); + data = tinfo.data; + + for (int i = 0; i < Dims; ++i) { + sizes[i] = IntDivider(tinfo.sizes[i]); + strides[i] = tinfo.strides[i]; + } + } + + __host__ __device__ T* get(IndexType linearIndex) const { + IndexType offset = 0; + + for (int i = Dims - 1; i > 0; --i) { + DivMod divmod = sizes[i].divmod(linearIndex); + linearIndex = divmod.div; + offset += divmod.mod * strides[i]; + } + + return &data[offset + linearIndex * strides[0]]; + } + + T* data; + IntDivider sizes[Dims]; + IndexType strides[Dims]; +}; + +// For 1D tensors the offset equals linear index * stride. +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) + : data{tinfo.data}, stride{tinfo.strides[0]} {} + + __host__ __device__ T* get(IndexType linearIndex) const { + return &data[linearIndex * stride]; + } + + T* data; + const IndexType stride; +}; + +// Dims=-1 is used when the dimension is unknown at compile time. +// +// Unfortunately, pre-computation does not work here, because of a bug in nvcc +// (tested on CUDA 8.0): if a kernel argument contains an array that is +// dynamically accessed, the whole array is first copied into the local memory. +// (That is, every kernel thread makes its own copy of the argument, even if it +// is never updated.) Pre-computation makes it worse because now we have more +// data to copy. +// +// So let's fall back to vanilla division approach. 
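// Worked example of the index-to-offset loop above, assuming sizes {2, 3}
// and row-major strides {3, 1}: for linearId = 4,
//   i = 1: curDimIndex = 4 % 3 = 1, offset += 1 * 1 = 1, linearId = 4 / 3 = 1
//   result: offset + linearId * strides[0] = 1 + 1 * 3 = 4,
// which is the flat position of element (1, 1) in a contiguous layout.
// OffsetInfo computes the same thing, but each '%'/'/' pair is replaced by
// IntDivider::divmod on a pre-computed constant; the specialization below
// skips that pre-computation for the reason described above.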
+ +template +struct OffsetInfo { + explicit OffsetInfo(const TensorInfo& tinfo) + : tinfo(tinfo) { } + + __host__ __device__ T* get(IndexType linearIndex) const { + IndexType offset = IndexToOffset::get(linearIndex, tinfo); + return &tinfo.data[offset]; + } + + TensorInfo tinfo; +}; + +#endif // THC_TENSOR_INFO_INC diff --git a/aten/src/THC/THCTensorMasked.cuh b/aten/src/THC/THCTensorMasked.cuh new file mode 100644 index 0000000..814e263 --- /dev/null +++ b/aten/src/THC/THCTensorMasked.cuh @@ -0,0 +1,58 @@ +#ifndef THC_TENSOR_MASKED_CUH +#define THC_TENSOR_MASKED_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCThrustAllocator.cuh" + +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct TensorMaskedFillOp { + TensorMaskedFillOp(T v) : value(v) {} + __device__ inline void operator()(T* t, MaskT* mask) { + if (*mask) { + *t = value; + } + } + + T value; +}; + +template +struct TensorMaskedCopyOp { + TensorMaskedCopyOp(T* s) : in(s) {} + + __device__ inline void operator()(T* out, + MaskT* mask, + MaskPrefixSumT* maskPrefixSum) { + if (*mask) { + *out = in[*maskPrefixSum]; + } + } + + // Where we are copying from + T* in; +}; + +template +struct TensorMaskedSelectOp { + TensorMaskedSelectOp(T* t) : out(t) {} + __device__ inline void operator()(MaskT* mask, + MaskPrefixSumT* maskPrefixSum, + T* in) { + if (*mask) { + out[*maskPrefixSum] = *in; + } + } + + T* out; +}; + +#endif // THC_TENSOR_MASKED_CUH diff --git a/aten/src/THC/THCTensorMath.cu b/aten/src/THC/THCTensorMath.cu new file mode 100644 index 0000000..4eded20 --- /dev/null +++ b/aten/src/THC/THCTensorMath.cu @@ -0,0 +1,140 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMath.cuh" +#include "THCThrustAllocator.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif +#include + +template +struct TensorFillOp { + TensorFillOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* v) { *v = val; } + + const T val; +}; + +// copypasta from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu +template +class strided_range +{ + public: + + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function + { + difference_type stride; + + stride_functor(difference_type stride) + : stride(stride) {} + + __host__ __device__ + difference_type operator()(const difference_type& i) const + { + return stride * i; + } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator PermutationIterator; + + // type of the strided_range iterator + typedef PermutationIterator iterator; + + // construct strided_range for the range [first,last) + strided_range(Iterator first, Iterator last, difference_type stride) + : first(first), last(last), stride(stride) {} + + iterator begin(void) const + { + return PermutationIterator(first, + TransformIterator(CountingIterator(0), + stride_functor(stride))); + } + + iterator end(void) const + { + return begin() + ((last - first) + (stride - 1)) / stride; + } + + protected: + Iterator first; + Iterator last; + difference_type stride; +}; + 
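// Usage sketch for strided_range (mirroring the thrust example it was copied
// from), assuming <thrust/device_vector.h> and <thrust/fill.h> are available:
// the iterator walks every stride-th element of the underlying range, so
// ordinary thrust algorithms can operate on a strided view in place.
static void strided_range_example() {
  thrust::device_vector<int> data(8, 0);
  typedef thrust::device_vector<int>::iterator Iterator;
  strided_range<Iterator> evens(data.begin(), data.end(), 2);
  thrust::fill(evens.begin(), evens.end(), 1);  // data: 1 0 1 0 1 0 1 0
}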
+struct idx_functor +{ + int64_t div; + int64_t size; + + __host__ __device__ + idx_functor(int64_t div, int64_t size) : div(div), size(size) {} + + __host__ __device__ + int64_t operator()(int64_t val) { + return (val / div) % size + TH_INDEX_BASE; + } +}; + +template +struct NonZeroOp +{ + NonZeroOp() {} + __host__ __device__ bool operator()(T lhs) const { + if (THCNumerics::ne(lhs, ScalarConvert::to(0.0))) { + return true; + } else { + return false; + } + } +}; + +template +struct LinspaceOp { + __host__ __device__ LinspaceOp(accT start, accT step): + start_(start), step_(step) { } + __device__ __forceinline__ T operator()(ptrdiff_t index) { + accT increment = THCNumerics::mul(step_, ScalarConvert::to(index)); + accT value = THCNumerics::add(start_, increment); + return ScalarConvert::to(value); + } + + const accT start_, step_; +}; + +template +struct LogspaceOp { + __host__ __device__ LogspaceOp(accT start, accT step): + start_(start), step_(step) { } + __device__ __forceinline__ T operator()(ptrdiff_t index) { + accT increment = THCNumerics::mul(step_, ScalarConvert::to(index)); + accT value = THCNumerics::exp10(THCNumerics::add(start_, increment)); + return ScalarConvert::to(value); + } + + const accT start_, step_; +}; + + +#include "generic/THCTensorMath.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMath.cuh b/aten/src/THC/THCTensorMath.cuh new file mode 100644 index 0000000..202090e --- /dev/null +++ b/aten/src/THC/THCTensorMath.cuh @@ -0,0 +1,130 @@ +#ifndef THC_TENSORMATH_CUH +#define THC_TENSORMATH_CUH + +// Copy the kth diagonal of a matrix B to a vector A. +template +__global__ void THCTensor_copyFromDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideA) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < size; + linearIndex += gridDim.x * blockDim.x) { + const ptrdiff_t bOffset = start + strideSum * linearIndex; + a[strideA * linearIndex] = b[bOffset]; + } +} + +// Copy vector B to the kth diagonal of a matrix A +template +__global__ void THCTensor_copyToDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideB) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < size; + linearIndex += gridDim.x * blockDim.x) { + const ptrdiff_t aOffset = start + strideSum * linearIndex; + a[aOffset] = b[strideB * linearIndex]; + } +} + +#define CAT_ARRAY_BATCH_SIZE 1024 +#define CAT_ARRAY_MAX_INPUT_DIMS 4 + +inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid) { + int curDevice = -1; + cudaGetDevice(&curDevice); + + if (curDevice == -1) { + return false; + } + + // Assume a reasonable number of SMs if no state is available + int numSM = + state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15; + //X dim of grid for cat array cooperates on a single tensor in the cat. + //Given half of the GPU, full utilization will always occur. + grid = dim3( 2LL * numSM, (long long) nTensors ); + + return true; +} + +// Similar to any other IndexToOffset calculation for copying along a given dimension. +template +struct CatArrIndexToOffset { + static inline __device__ IndexType compute( + const IndexType outputSize[Dims], + const IndexType outputStride[Dims], + const IndexType dimSize, + const unsigned int concatDim, + IndexType linearIndex) { + IndexType offset = 0; + +#pragma unroll + for (int i = Dims - 1; i >= 1; --i) { + IndexType curDimSize = i == concatDim ? 
dimSize : outputSize[i]; + IndexType nextDimIndex = linearIndex / curDimSize; + IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex; + IndexType curDimOffset = curDimIndex * outputStride[i]; + offset += curDimOffset; + linearIndex = nextDimIndex; + } + + return offset + linearIndex * outputStride[0]; + } +}; + +template +struct CatArrInputTensor { + T* input; + IndexType offset; + IndexType dimSize; + IndexType nElements; +}; + +template +struct OutputTensorSizeStride { + IndexType outputSize[MaxDims]; + IndexType outputStride[MaxDims]; +}; + +/** + * Kernel used to concatenated grimDim.y tensors into an output tensor. Uses a grid-stride loop based off of + * the blockIdx.x, threadIdx.x for each input to copy each element from each input tensor into the output. + * + * output: base pointer to the storage associated with the output tensor + * inputs: GPU-allocated array of input metadata for each input to concatenate in the kernel + * os: the size/stride vectors for the output tensor + * concatDim: dimension along which we are concatenating + * dimStride: the stride of the output tensor at the concatDim + * + * The most important assumption made is that the input tensors are contiguous. + */ + + + +template +__global__ void CatArrayBatchedCopy( + T* output, + CatArrInputTensor* inputs, + OutputTensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs[blockIdx.y].nElements; + + if(tid >= nElements) return; + + T* data = inputs[blockIdx.y].input; + IndexType offset = inputs[blockIdx.y].offset; + IndexType dimSize = inputs[blockIdx.y].dimSize; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.outputSize, os.outputStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[tid]; + + tid += stride; + } +} + +#endif diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h new file mode 100644 index 0000000..7696749 --- /dev/null +++ b/aten/src/THC/THCTensorMath.h @@ -0,0 +1,58 @@ +#ifndef TH_CUDA_TENSOR_MATH_INC +#define TH_CUDA_TENSOR_MATH_INC + +#include "THCTensor.h" +#include "THCGeneral.h" + +#include "generic/THCTensorMath.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathBlas.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathMagma.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPairwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPointwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathReduce.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompare.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompareT.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathScan.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMasked.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorScatterGather.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorIndex.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorSort.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMode.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorTopK.h" +#include "THCGenerateAllTypes.h" + +THC_API int THCudaByteTensor_logicalAndAll(THCState *state, THCudaByteTensor *self); +THC_API int 
THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self); + +THC_API void THCudaByteTensor_logicalAnd(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim); +THC_API void THCudaByteTensor_logicalAny(THCState *state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim); + +#endif diff --git a/aten/src/THC/THCTensorMathBlas.cu b/aten/src/THC/THCTensorMathBlas.cu new file mode 100644 index 0000000..5551b0c --- /dev/null +++ b/aten/src/THC/THCTensorMathBlas.cu @@ -0,0 +1,10 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/THCTensorMathBlas.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathCompare.cuh b/aten/src/THC/THCTensorMathCompare.cuh new file mode 100644 index 0000000..9fac608 --- /dev/null +++ b/aten/src/THC/THCTensorMathCompare.cuh @@ -0,0 +1,87 @@ +#ifndef THC_TENSORMATH_COMPARE_CUH +#define THC_TENSORMATH_COMPARE_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" + +template +struct TensorLTValueOp { + TensorLTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::lt(*in, value)); + } + + const T value; +}; + +template +struct TensorGTValueOp { + TensorGTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::gt(*in, value)); + } + + const T value; +}; + + +template +struct TensorLEValueOp { + TensorLEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::le(*in, value)); + } + + const T value; +}; + +template +struct TensorGEValueOp { + TensorGEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::ge(*in, value)); + } + + const T value; +}; + +template +struct TensorEQValueOp { + TensorEQValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::eq(*in, value)); + } + + const T value; +}; + +template +struct TensorNEValueOp { + TensorNEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert::to(THCNumerics::ne(*in, value)); + } + + const T value; +}; + +template +void THC_logicalValue(THCState *state, + TensorTypeOut *self_, + TensorType *src, + Op op) { + THLongStorage* st = THCTensor_newSizeOf(state, src); + THCTensor_resize(state, self_, st, NULL); + THLongStorage_free(st); + + if (!THC_pointwiseApply2(state, self_, src, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_TENSORMATH_COMPARE_CUH diff --git a/aten/src/THC/THCTensorMathCompareT.cuh b/aten/src/THC/THCTensorMathCompareT.cuh new file mode 100644 index 0000000..9b1fb4e --- /dev/null +++ b/aten/src/THC/THCTensorMathCompareT.cuh @@ -0,0 +1,74 @@ +#ifndef THC_TENSORMATH_COMPARET_CUH +#define THC_TENSORMATH_COMPARET_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" + +template +struct TensorLTOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = 
ScalarConvert::to(THCNumerics::lt(*a, *b)); + } +}; + +template +struct TensorGTOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::gt(*a, *b)); + } +}; + +template +struct TensorLEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::le(*a, *b)); + } +}; + +template +struct TensorGEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::ge(*a, *b)); + } +}; + +template +struct TensorEQOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::eq(*a, *b)); + } +}; + +template +struct TensorNEOp { + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert::to(THCNumerics::ne(*a, *b)); + } +}; + +template +void THC_logicalTensor(THCState *state, + TensorTypeOut *self_, + TensorType *src1, + TensorType *src2, + Op op) { + THLongStorage* st = THCTensor_newSizeOf(state, src1); + THCTensor_resize(state, self_, st, NULL); + THLongStorage_free(st); + + THArgCheck(THCTensor_nElement(state, src1) == + THCTensor_nElement(state, src2), 3, + "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_TENSORMATH_COMPARET_CUH diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu new file mode 100644 index 0000000..4aa6249 --- /dev/null +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -0,0 +1,29 @@ +#include "THCGeneral.h" +#include "THCTensorMath.h" +#include "THCTensorCopy.h" +#include "THCTensorMathMagma.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include + +#ifdef USE_MAGMA +#include +#else +#include "THCBlas.h" +#endif + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)" + +void THCMagma_init(THCState *state) +{ +#ifdef USE_MAGMA + magma_init(); +#endif +} + +#include "generic/THCTensorMathMagma.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh new file mode 100644 index 0000000..6495049 --- /dev/null +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -0,0 +1,22 @@ +#ifndef THC_TENSOR_MATH_MAGMA_CUH +#define THC_TENSOR_MATH_MAGMA_CUH + +#ifdef USE_MAGMA +#include +#else +#include "THCBlas.h" +#endif + +#ifdef USE_MAGMA +template +static inline T* th_magma_malloc_pinned(size_t n) +{ + void* ptr; + if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T))) + THError("$ Torch: not enough memory: you tried to allocate %dGB. 
Buy new RAM!", n/268435456); + return reinterpret_cast(ptr); +} + +#endif + +#endif // THC_TENSOR_MATH_MAGMA_CUH diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu new file mode 100644 index 0000000..19434f3 --- /dev/null +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -0,0 +1,494 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" + +template +struct TensorAddConstantOp { + TensorAddConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in + val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v += val; + } + + const T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorAddConstantOp(half v) : val(v) {} +#else + TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fout = fin + fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + fv += fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + + +template +struct TensorSubConstantOp { + TensorSubConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in - val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v -= val; + } + + const T val; +}; + + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {} +#else + TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fout = fin + fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + fv += fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + + +template +struct TensorMulConstantOp { + TensorMulConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v *= val; + } + + const T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorMulConstantOp(half v) : val(v) {} +#else + TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fout = fin * fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + fv *= fval; + *v = __float2half(fv); 
+#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorDivConstantOp { + TensorDivConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in / val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v /= val; + } + + const T val; +}; + +template <> +struct TensorDivConstantOp { + TensorDivConstantOp(float v) : val(1.f / v) {} + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(float* v) { + *v *= val; + } + + const float val; +}; + +template <> +struct TensorDivConstantOp { + TensorDivConstantOp(double v) : val(1. / v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in * val; + } + + __device__ __forceinline__ void operator()(double* v) { + *v *= val; + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivConstantOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorDivConstantOp(half v) : val(ScalarInv::to(v)) {} +#else + TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} +#endif + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fout = fin * fval; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + fv *= fval; + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return (a != 0) && (a < 0) != (b < 0); +} + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return false; +} + +template +struct TensorRemainderOp { + TensorRemainderOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in % val; + if (modulo_wrap(*out, val)) { + *out += val; + } + } + + __device__ __forceinline__ void operator()(T* v) { + *v = *v % val; + if (modulo_wrap(*v, val)) { + *v += val; + } + } + + const T val; +}; + +template <> +struct TensorRemainderOp { + TensorRemainderOp(float v) : val(v) {} + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in - val * floorf(*in / val); + } + + __device__ __forceinline__ void operator()(float* v) { + *v = *v - val * floorf(*v / val); + } + + const float val; +}; + +template <> +struct TensorRemainderOp { + TensorRemainderOp(double v) : val(v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in - val * floor(*in / val); + } + + __device__ __forceinline__ void operator()(double* v) { + *v = *v - val * floor(*v / val); + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorRemainderOp { +#ifdef CUDA_HALF_INSTRUCTIONS + TensorRemainderOp(half v) : val(v) {} +#else + TensorRemainderOp(half v): fval(THC_half2float(v)) {} +#endif + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in, __hmul(val, hfloor(__hdiv(*in, val)))); +#else + float fin = __half2float(*in); + float fout = fin - fval * floorf(fin / fval); + *out = __float2half(fout); 
+#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val)))); +#else + float fv = __half2float(*v); + fv = fv - fval * floorf(fv / fval); + *v = __float2half(fv); +#endif + } + +#ifdef CUDA_HALF_INSTRUCTIONS + const half val; +#else + const float fval; +#endif +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorFmodOp { + TensorFmodOp(T v) : val((float)v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = (T) fmodf((float) *in, val); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = (T) fmodf((float) *v, val); + } + + const float val; +}; + +template <> +struct TensorFmodOp { + TensorFmodOp(double v) : val(v) {} + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = fmod(*in, val); + } + + __device__ __forceinline__ void operator()(double* v) { + *v = fmod(*v, val); + } + + const double val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorFmodOp { + TensorFmodOp(half v): fval(THC_half2float(v)) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { + *out = __float2half(fmodf(__half2float(*in), fval)); + } + + __device__ __forceinline__ void operator()(half* v) { + *v = __float2half(fmodf(__half2float(*v), fval)); + } + + const float fval; +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorTriOp { + TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) + : start(start_), stride0(stride0_), stride1(stride1_), k(k_) {} + + __device__ __forceinline__ int mask(T *out) { + ptrdiff_t n = out - start; + int64_t row, col; + if (stride0 > stride1) + { + row = (int64_t) (n / stride0); + col = (int64_t) ((n % stride0) / stride1); + } + else + { + row = (int64_t) ((n % stride1) / stride0); + col = (int64_t) (n / stride1); + } + + return Upper ? (col - row >= k) : (col - row <= k); + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = mask(out) ? 
*in : ScalarConvert::to(0); + } + + __device__ __forceinline__ void operator()(T* v) { + if (!mask(v)) + *v = ScalarConvert::to(0); + } + + const T *start; + const int64_t stride0, stride1, k; +}; + +template +struct TensorLShiftConstantOp { + TensorLShiftConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in << val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v <<= val; + } + + const T val; +}; + +template +struct TensorRShiftConstantOp { + TensorRShiftConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in >> val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v >>= val; + } + + const T val; +}; + +template +struct TensorBitAndConstantOp { + TensorBitAndConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in & val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v &= val; + } + + const T val; +}; + +template +struct TensorBitOrConstantOp { + TensorBitOrConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in | val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v |= val; + } + + const T val; +}; + +template +struct TensorBitXorConstantOp { + TensorBitXorConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *in ^ val; + } + + __device__ __forceinline__ void operator()(T* v) { + *v ^= val; + } + + const T val; +}; + +#include "generic/THCTensorMathPairwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh new file mode 100644 index 0000000..26389c3 --- /dev/null +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -0,0 +1,929 @@ +#ifndef THC_TENSORMATH_POINTWISE_CUH +#define THC_TENSORMATH_POINTWISE_CUH + +#include +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" + + +template +struct TensorATan2Op { + __device__ __forceinline__ void operator()(T* out, T* a, T* b) { + *out = THCNumerics::atan2(*a, *b); + } +}; + +template +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(T* out, T* in) const { + T one = (T) 1.0; + *out = one / (one + THCNumerics::exp(- *in)); + } + + __device__ __forceinline__ void operator()(T* v) const { + T one = (T) 1.0; + *v = one / (one + THCNumerics::exp(- *v)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(half* out, half* in) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); +#else + float fin = __half2float(*in); + *out = __float2half(1.0f / (1.0f + expf(- fin))); +#endif + } + + __device__ __forceinline__ void operator()(half* v) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); +#else + float fv = __half2float(*v); + *v = __float2half(1.0f / (1.0f + expf(- fv))); +#endif + } +}; +#endif + +template +struct TensorSignOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + T orig = *in; + *out = (orig > 0) - (orig < 0); + } + + __device__ __forceinline__ void operator()(T* v) { + T orig = *v; + *v = (orig > 0) - (orig < 0); + } +}; + +template <> +struct TensorSignOp { + __device__ __forceinline__ void 
operator()(unsigned char* out, unsigned char* in) { + unsigned char orig = *in; + *out = (orig == 0) ? 0 : 1; + } + + __device__ __forceinline__ void operator()(unsigned char* v) { + unsigned char orig = *v; + *v = (orig == 0) ? 0 : 1; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSignOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + half zero = ScalarConvert::to(0); + half orig = *in; + *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); +#else + float orig = __half2float(*in); + *out = __float2half((orig > 0) - (orig < 0)); +#endif + } + + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + half zero = ScalarConvert::to(0); + half orig = *v; + *v = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); +#else + float orig = __half2float(*v); + *v = __float2half((orig > 0) - (orig < 0)); +#endif + } +}; +#endif + +template +struct TensorAddOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout += fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 + fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorCAddOp { + TensorCAddOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += val * *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + val * *in2; + } + + T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCAddOp { + TensorCAddOp(half v) : val(v) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, __hmul(val, *in)); +#else + float fout = __half2float(*out); + float fval = __half2float(val); + float fin = __half2float(*in); + + fout += fval * fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, __hmul(val, *in2)); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fval = __half2float(val); + + float fout = fin1 + fval * fin2; + *out = __float2half(fout); +#endif + } + + half val; +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorSubOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out -= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 - *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout -= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef 
CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 - fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorMulOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out *= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 * *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout *= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 * fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + if (StaticExp == 1) { + *out = *in; + } else if (StaticExp == 2) { + *out = THCNumerics::mul(*in, *in); + } else if (StaticExp == 3) { + T square = THCNumerics::mul(*in, *in); + *out = THCNumerics::mul(square, *in); + } else { + *out = THCNumerics::pow(*in, val); + } + } + + __device__ __forceinline__ void operator()(T* v) { + if (StaticExp == 1) { + *v = *v; + } else if (StaticExp == 2) { + *v = THCNumerics::mul(*v, *v); + } else if (StaticExp == 3) { + *v = THCNumerics::mul(THCNumerics::mul(*v, *v), *v); + } else { + *v = THCNumerics::pow(*v, val); + } + } + + const T val; +}; + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::cinv(*in); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = THCNumerics::cinv(*v); + } + + const T val; +}; + +template +struct TensorPowOp { + TensorPowOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + T square = THCNumerics::mul(*in, *in); + *out = THCNumerics::cinv(square); + } + + __device__ __forceinline__ void operator()(T* v) { + T square = THCNumerics::mul(*v, *v); + *v = THCNumerics::cinv(square); + } + + const T val; +}; + +template +struct TensorTPowOp { + TensorTPowOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::pow(val, *in); + } + + __device__ __forceinline__ void operator()(T* v) { + *v = THCNumerics::pow(val, *v); + } + + const T val; +}; + +template +struct TensorCPowOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::pow(*out, *in); + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::pow(*in1, *in2); + } +}; + +template <> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = powf(*out, *in); + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = powf(*in1, *in2); + } +}; + + +template <> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = pow(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = pow(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct 
TensorCPowOp { + __device__ __forceinline__ void operator()(half* out, half* in) { + // No fp16 pow function yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout = powf(fout, fin); + *out = __float2half(fout); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + // No fp16 pow function yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = powf(fin1, fin2); + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorDivOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out /= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 / *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivOp { + __device__ __forceinline__ void + operator()(half* out, half* in) { + // No fp16 div instruction yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout /= fin; + *out = __float2half(fout); + } + + __device__ __forceinline__ void + operator()(half* out, half* in1, half* in2) { + // No fp16 div instruction yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 / fin2; + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return (a != 0) && (a < 0) != (b < 0); +} + +template +static __device__ __forceinline__ +typename std::enable_if::value, bool>::type +modulo_wrap(T a, T b) { + return false; +} + +template +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + T val = *out % *in; + if (modulo_wrap(val, *in)) { + val += *in; + } + *out = val; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + T val = *in1 % *in2; + if (modulo_wrap(val, *in2)) { + val += *in2; + } + *out = val; + } +}; + +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; + } +}; + +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = *in != 0. ? *out - *in * floor(*out / *in) : NAN; + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = *in2 != 0. ? *in1 - *in2 * floor(*in1 / *in2) : NAN; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in)))); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2)))); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + *out = fin2 != 0 ? 
__float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = *out % *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 % *in2; + } +}; + +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(float* out, float* in) { + *out = fmodf(*out, *in); + } + + __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { + *out = fmodf(*in1, *in2); + } +}; + +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = fmod(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = fmod(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(half* out, half* in) { + *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + } +}; +#endif // CUDA_HALF_TENSOR + +template +struct TensorClampOp { + TensorClampOp(T min, T max) : minValue(min), maxValue(max) {} + __device__ __forceinline__ void operator()(T* out, T* in) { + T val = THCNumerics::lt(*in, maxValue) ? *in : maxValue; + *out = THCNumerics::gt(minValue, val) ? minValue : val; + } + + __device__ __forceinline__ void operator()(T* v) { + T val = THCNumerics::lt(*v, maxValue) ? *v : maxValue; + *v = THCNumerics::gt(minValue, val) ? minValue : val; + } + + const T minValue; + const T maxValue; +}; + +template +struct TensorLerpOp { + TensorLerpOp(T w) : w(w) {} + + __device__ __forceinline__ void operator()(T *out, T *a, T *b) { + *out = THCNumerics::add( + *a, + THCNumerics::mul( + w, + THCNumerics::sub(*b, *a) + ) + ); + } + + const T w; +}; + +template +struct TensorCrossOp { + TensorCrossOp(int64_t sx, int64_t sy, int64_t so) : sx(sx), sy(sy), so(so) {} + + __device__ __forceinline__ void operator()(T* out, T* x, T*y) { + T val0 = THCNumerics::sub( + THCNumerics::mul(x[1 * sx], y[2 * sy]), + THCNumerics::mul(x[2 * sx], y[1 * sy]) + ); + + T val1 = THCNumerics::sub( + THCNumerics::mul(x[2 * sx], y[0 * sy]), + THCNumerics::mul(x[0 * sx], y[2 * sy]) + ); + + T val2 = THCNumerics::sub( + THCNumerics::mul(x[0 * sx], y[1 * sy]), + THCNumerics::mul(x[1 * sx], y[0 * sy]) + ); + + out[0 * so] = val0; + out[1 * so] = val1; + out[2 * so] = val2; + } + + const int64_t sx, sy, so; +}; + +template +struct TensorMaxOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::gt(*out, *in) ? *out : *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::gt(*in1, *in2) ? *in1 : *in2; + } +}; + +template +struct TensorMinOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::lt(*out, *in) ? *out : *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::lt(*in1, *in2) ? *in1 : *in2; + } +}; + +template +struct TensorMaxValueOp { + TensorMaxValueOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out) { + *out = THCNumerics::lt(*out, val) ? val : *out; // this order propagates NaN + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::lt(*in, val) ? 
val : *in; // this order propagates NaN + } + + T val; +}; + +template +struct TensorMinValueOp { + TensorMinValueOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out) { + *out = THCNumerics::gt(*out, val) ? val : *out; // this order propagates NaN + } + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = THCNumerics::gt(*in, val) ? val : *in; // this order propagates NaN + } + + T val; +}; + +template +struct TensorAddCMulOp { + TensorAddCMulOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::add( + *out, + THCNumerics::mul( + val, + THCNumerics::mul(*in1, *in2) + ) + ); + } + + T val; +}; + +template +struct TensorAddCDivOp { + TensorAddCDivOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = THCNumerics::add( + *out, + THCNumerics::mul( + val, + THCNumerics::div(*in1, *in2) + ) + ); + } + + T val; +}; + +template +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out <<= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 << *in2; + } +}; + +template <> +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(float* out, float* in) { + *out *= powf(2.0f, *in); + } + + __device__ __forceinline__ void + operator()(float* out, float* in1, float* in2) { + *out = *in1 * powf(2.0f, *in2); + } +}; + +template <> +struct TensorLShiftOp { + __device__ __forceinline__ void + operator()(double* out, double* in) { + *out *= pow(2.0, *in); + } + + __device__ __forceinline__ void + operator()(double* out, double* in1, double* in2) { + *out = *in1 * pow(2.0, *in2); + } +}; + +template +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out >>= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 >> *in2; + } +}; + + +template <> +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(float* out, float* in) { + *out /= powf(2.0f, *in); + } + + __device__ __forceinline__ void + operator()(float* out, float* in1, float* in2) { + *out = *in1 / powf(2.0f, *in2); + } +}; + +template <> +struct TensorRShiftOp { + __device__ __forceinline__ void + operator()(double* out, double* in) { + *out /= pow(2.0, *in); + } + + __device__ __forceinline__ void + operator()(double* out, double* in1, double* in2) { + *out = *in1 / pow(2.0, *in2); + } +}; + +template +struct TensorBitAndOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out &= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 & *in2; + } +}; + +template +struct TensorBitOrOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out |= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 | *in2; + } +}; + +template +struct TensorBitXorOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out ^= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 ^ *in2; + } +}; + +/* + * The following function was converted to CUDA form from code that comes + * with the following copyright notice. It has been released under the BSD license. + * + * Cephes Math Library Release 2.8: June, 2000 + * Copyright 1984, 1987, 1992, 2000 by Stephen L. 
Moshier + */ +template +struct TensorDigammaOp { + __device__ __forceinline__ void + operator()(real* out, real* in) { + using compute_type = typename std::conditional::value, accreal, real>::type; + static const double PI_f64 = 3.14159265358979323846; + static const compute_type PSI_10 = 2.25175258906672110764; + static const compute_type A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + auto x = scalar_cast(*in); + if (x == 0) { + *out = scalar_cast(INFINITY); + return; + } + + bool x_is_integer = x == floor(x); + compute_type result = 0; + if (x < 0) { + if (x_is_integer) { + *out = scalar_cast(INFINITY); + return; + } + // Rounding errors in tan's input can really affect the output + // for extreme values, so we always perform this computation in double. + result = scalar_cast( + - PI_f64 / tan(PI_f64 * scalar_cast(x))); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + *out = scalar_cast(result + PSI_10); + return; + } + + compute_type y = 0; + if (x < 1.0e17) { + compute_type z = 1.0 / (x * x); + + compute_type polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + *out = scalar_cast(log(x) - (0.5 / x) - y + result); + return; + } +}; + +template +struct TensorTrigammaOp { + using compute_type = typename std::conditional::value, accreal, real>::type; + __device__ __forceinline__ void + operator()(real* out, real* in) { + const compute_type PI = 3.14159265358979323846; + compute_type x = ScalarConvert::to(*in); + compute_type sign = +1; + compute_type result = 0; + if (x < 0.5f) { + sign = -1; + compute_type sin_pi_x = THCNumerics::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const compute_type ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + *out = ScalarConvert::to(sign * result); + } +}; + +#endif // THC_TENSORMATH_POINTWISE_CUH diff --git a/aten/src/THC/THCTensorMathReduce.cu b/aten/src/THC/THCTensorMathReduce.cu new file mode 100644 index 0000000..e024e1f --- /dev/null +++ b/aten/src/THC/THCTensorMathReduce.cu @@ -0,0 +1,62 @@ +#include "THCTensorMathReduce.cuh" +#include "THCTensor.hpp" + +THC_API int +THCudaByteTensor_logicalAndAll(THCState *state, THCudaByteTensor *self) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity(), + LogicalAll(), + (unsigned char) 1, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + +THC_API int +THCudaByteTensor_logicalAnyAll(THCState *state, THCudaByteTensor *self) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity(), + LogicalAny(), + (unsigned char) 0, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + +THC_API void +THCudaByteTensor_logicalAnd(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity(), + LogicalAll(), + thrust::identity(), + (unsigned char) 1, + dimension, + 
keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCudaByteTensor_logicalAny(THCState* state, THCudaByteTensor *self, THCudaByteTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCudaByteTensor_checkGPU(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity(), + LogicalAny(), + thrust::identity(), + (unsigned char) 0, + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh new file mode 100644 index 0000000..5a0a804 --- /dev/null +++ b/aten/src/THC/THCTensorMathReduce.cuh @@ -0,0 +1,728 @@ +#ifndef THC_TENSORMATH_REDUCE_CUH +#define THC_TENSORMATH_REDUCE_CUH + +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" +#include "THCReduceAll.cuh" +#include "THCTensorCopy.hpp" +#include "THCThrustAllocator.cuh" +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +/* +Reductions that (only) operate on accumulate types. +*/ + +template +struct ReduceAdd { + inline __device__ T operator()(const T a, const T b) const { + return THCNumerics::add(a, b); + } +}; + +template +struct ReduceMultiply { + inline __device__ T operator()(const T a, const T b) const { + return THCNumerics::mul(a, b); + } +}; + +template +struct ReduceDivide { + ReduceDivide(const T _divisor): divisor{_divisor} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::div(x, divisor); + } + + const T divisor; +}; + +template +struct ReducePow { + ReducePow(const T _exponent): exponent{_exponent} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::pow(x, exponent); + } + + const T exponent; +}; + +template +struct SquareFunctor { + SquareFunctor(const T _mean): mean{_mean} {} + + inline __device__ T operator()(const T x) const { + return THCNumerics::mul( + THCNumerics::sub(x, mean), + THCNumerics::sub(x, mean) + ); + } + + const T mean; +}; + +template +struct ReduceMin { + inline __device__ T operator()(T a, T b) const { + return (THCNumerics::lt(a, b) || THCNumerics::isnan(a)) ? a : b; + } +}; + +template +struct ReduceMax { + inline __device__ T operator()(T a, T b) const { +#if defined(__HIP_PLATFORM_HCC__) + return (static_cast(THCNumerics::sub(a, b)) > 0 || THCNumerics::isnan(a)) ? a : b; +#else + return (THCNumerics::gt(a, b) || THCNumerics::isnan(a)) ? a : b; +#endif + } +}; + +struct LogicalAll { + inline __device__ unsigned char operator()(const unsigned char x, + const unsigned char y) const { + return (x && y); + } +}; + +struct LogicalAny { + inline __device__ unsigned char operator()(const unsigned char x, + const unsigned char y) const { + return (x || y); + } +}; + +template +inline __device__ T THCMax(const T a, const T b) { + return THCNumerics::gt(a, b) ? 
a : b; +} + +template +__global__ void THCTensor_kernel_renorm(T *data, + const AccT value, + const ptrdiff_t size, + const AccT maxnorm) { + __shared__ AccT buffer[32]; + int64_t tx = threadIdx.x; + int64_t bx = blockIdx.x; + int64_t step = blockDim.x; + T *row = data + size * bx; + + buffer[tx] = scalar_cast(0); + AccT norm; + +#if !defined(__HIP_DEVICE_COMPILE__) + if (THCNumerics::eq(value, scalar_cast(INFINITY))) { + // get norm of axis + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + buffer[tx] = THCMax(buffer[tx], THCNumerics::abs(val)); + } + // add (reduce) + for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tx < stride) + buffer[tx] = THCMax(buffer[tx], buffer[tx+stride]); + } + // clip norms + __syncthreads(); + norm = buffer[0]; + } else { + // get norm of axis + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + buffer[tx] = THCNumerics::add( + buffer[tx], + THCNumerics::pow(THCNumerics::abs(val), value) + ); + } + // add (reduce) + for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tx < stride) + buffer[tx] = THCNumerics::add(buffer[tx], buffer[tx+stride]); + } + // clip norms + __syncthreads(); + norm = THCNumerics::pow(buffer[0], THCNumerics::cinv(value)); + } + + if (THCNumerics::gt(norm, maxnorm)) { + norm = THCNumerics::div( + maxnorm, + THCNumerics::add(norm, scalar_cast(1e-7)) + ); + // renormalize + for (ptrdiff_t i = tx; i < size; i += step) { + const AccT val = scalar_cast(row[i]); + row[i] = scalar_cast(THCNumerics::mul(val, norm)); + } + } +#endif +} + +template +struct TensorNonZeroOp { + TensorNonZeroOp() {} + + __host__ __device__ T operator()(const T lhs) const { + const T zero = scalar_cast(0); + if (THCNumerics::eq(lhs, zero)) return zero; + + return scalar_cast(1); + } +}; + +template +struct TensorNormOp { + TensorNormOp(T _exponent) : exponent{_exponent} {} + + __host__ __device__ T operator()(const T x) const { + switch (StaticExp) { + case 1: return THCNumerics::abs(x); + case 2: return THCNumerics::mul(x, x); + default: return THCNumerics::pow(THCNumerics::abs(x), exponent); + } + } + + const T exponent; +}; + +/* + Fuses conversions and a TensorDistOp. Needed for Thrust. +*/ +template +struct ThrustTensorDistOp { + ThrustTensorDistOp(AccT _exponent) : exponent{_exponent} {} + + __host__ __device__ AccT operator()(T _x, T _y) const { + const AccT x = scalar_cast(_x); + const AccT y = scalar_cast(_y); + return THCNumerics::pow( + THCNumerics::abs(THCNumerics::sub(x, y)), + exponent); + } + + const AccT exponent; +}; + +#include + +// Given the sum of values and the sum of squares, compute the variance or standard deviation. +template +__forceinline__ __device__ T THCTensor_computeVar( + T sum, + T sum2, + const unsigned row_size) { + + T rs2 = scalar_cast(row_size); + T rs2m = scalar_cast(row_size - 1); + T zero = scalar_cast(0); + + if (flag) { + sum = THCNumerics::div(sum, rs2); + sum2 = THCNumerics::div(sum2, rs2); + sum2 = THCNumerics::sub(sum2, THCNumerics::mul(sum, sum)); + sum2 = (THCNumerics::lt(sum2, zero) ? zero : sum2); + } else { + sum = THCNumerics::div(sum, rs2); + sum2 = THCNumerics::div(sum2, rs2m); + sum2 = THCNumerics::sub(sum2, + THCNumerics::mul( + THCNumerics::div(rs2 ,rs2m), + THCNumerics::mul(sum, sum))); + sum2 = (THCNumerics::lt(sum2, zero) ? 
zero : sum2); + } + + if (apply_sqrt) + return THCNumerics::sqrt(sum2); + + return sum2; +} + +/* Compute the variance (or standard deviation) along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * - if flag is set, normalize by `row_size` instead of `row_size - 1` + * - if apply_sqrt is set, compute the standard deviation instead of variance + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void THCTensor_kernel_varOuterDim(T *tgt, T *src_, unsigned num_orows, unsigned num_irows, unsigned row_size) { + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + T *src = src_ + orow * row_size * num_irows + irow; + AccT mean = scalar_cast(0); + AccT m2 = scalar_cast(0); + + for (unsigned col = 0; col < row_size; ++col) { + AccT val = scalar_cast(*src); + AccT delta = THCNumerics::sub(val, mean); + mean = THCNumerics::add(mean, + THCNumerics::div(delta, scalar_cast(col + 1))); + AccT delta2 = THCNumerics::sub(val, mean); + m2 = THCNumerics::add(m2, + THCNumerics::mul(delta, delta2)); + src += num_irows; + } + + if (flag) { + m2 = THCNumerics::div(m2, scalar_cast(row_size)); + } else { + m2 = THCNumerics::div(m2, scalar_cast(row_size - 1)); + } + + tgt[orow * num_irows + irow] = scalar_cast( + apply_sqrt ? THCNumerics::sqrt(m2) : m2); + } + } +} + +template +__host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int64_t dimension, int flag) { + unsigned ndim = THCTensor__nDimension(state, src); + // Treat all outer dimensions (i.e. dim < dimension) as one. + unsigned num_orows = 1; + for (int64_t dim = 0; dim < dimension; dim++) { + num_orows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, dimension); + // Treat all inner dimensions (i.e. dim > dimension) as one. + unsigned num_irows = 1; + for (unsigned dim = dimension + 1; dim < ndim; dim++) { + num_irows *= THCTensor_size(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + if (flag) { + THCTensor_kernel_varOuterDim<<>>( + tgt->template data(), src->template data(), num_orows, num_irows, row_size); + } else { + THCTensor_kernel_varOuterDim<<>>( + tgt->template data(), src->template data(), num_orows, num_irows, row_size); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) THError(cudaGetErrorString(errcode)); +} + +/* Compute the variance (or standard deviation) of the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * - if flag is set, normalize by `row_size` instead of `row_size - 1` + * - if apply_sqrt is set, compute the standard deviation instead of variance + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. 
the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + * + * Uses Welford's algorithm for numeric stability. Divides the dataset into parallel groups + * and computes the M2 and mean for each group. (M2 is \sum (x - \bar{x})^2) + * For example, if the data is split into two groups x and y, the overall M2 can + * be computed by: + * + * overall_M2 = M2x + nx * (mean(x) - overall_mean)^2 + * + M2y + ny * (mean(y) - overall_mean)^2 + * + * This implementation assumes that each block has been launched with 16 x 32 threads. + */ +template +__global__ void THCTensor_kernel_varInnermostDim(T *tgt, T *src_, unsigned num_rows, unsigned row_size) { + /* + * Each block computes the var/std of blockDim.y (32) rows at once. + * One can visualize the computation as a 16 (x) by 32 (y) grid. + * - Each of the 32 rows of the block is responsible for the computation + * of one input row. + * - Each row has 16 columns; the variance computation of one input row is + * split between 16 threads. + * - Each of those 16 threads handles the accumulation of 1/16 of the input + * row's data. + */ + for (unsigned block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + + /* + * Compute local mean, local M2 via Welford's algorithm for this thread. + */ + AccT acc_zero = scalar_cast(0); + AccT local_mean = acc_zero; + AccT local_M2 = acc_zero; + unsigned count = 0; + + if (row < num_rows) { + T *src = src_ + row * row_size; + + for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { + ++count; + AccT val = scalar_cast(src[col]); + AccT delta = THCNumerics::sub(val, local_mean); + local_mean = THCNumerics::add( + local_mean, + THCNumerics::div(delta, scalar_cast(count))); + AccT delta2 = THCNumerics::sub(val, local_mean); + local_M2 = THCNumerics::add( + local_M2, + THCNumerics::mul(delta, delta2)); + } + } + + AccT local_sum = + THCNumerics::mul(local_mean, scalar_cast(count)); + + /* + * We are reducing across each row of 16 threads to find the true sum of the + * entire input row. The warp shfl xor loop ultimately gives each thread the + * true sum. + */ + for (unsigned lane_mask = 8; lane_mask > 0; lane_mask >>= 1) { + local_sum = THCNumerics::add(local_sum, + WARP_SHFL_XOR((row < num_rows) ? local_sum : acc_zero, lane_mask, 16)); + } + AccT true_mean = THCNumerics::div(local_sum, + scalar_cast(row_size)); + + /* + * Adjust each local_M2 according to the following: + * adjusted_M2 = local_M2 + mean_diff * mean_diff * count + * The sum of these adjusted M2s is equal to the overall M2. + */ + AccT adjusted_M2 = acc_zero; + if (row < num_rows) { + AccT mean_diff = THCNumerics::sub(true_mean, local_mean); + adjusted_M2 = THCNumerics::add( + local_M2, + THCNumerics::mul( + THCNumerics::mul(mean_diff, mean_diff), + scalar_cast(count))); + } + + /* + * Sums the adjusted M2s. The thread with threadIdx.x == 0 has + * the total sum, which is equal to the M2 for the entire input row. + */ + for (unsigned s = 8; s >= 1; s >>= 1) { + adjusted_M2 = THCNumerics::add(adjusted_M2, + WARP_SHFL_DOWN((row < num_rows) ? 
adjusted_M2 : acc_zero, s, 16)); + } + + if (row < num_rows && threadIdx.x == 0) { + AccT M2 = adjusted_M2; + AccT variance; + if (flag) { + variance = THCNumerics::div(M2, scalar_cast(row_size)); + } else { + variance = THCNumerics::div(M2, scalar_cast(row_size - 1)); + } + tgt[row] = scalar_cast( + apply_sqrt ? THCNumerics::sqrt(variance) : variance); + } + } +} + +template +__host__ void THCTensor_varInnermostDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int flag) { + unsigned ndim = THCTensor__nDimension(state, src); + // Treat all outer dimensions as a single dimension. + unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, ndim - 1); + + // From limited testing, 16x32 seemed a good compromise for handling both long and short dimensions. + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + if (flag) { + THCTensor_kernel_varInnermostDim<<>>( + tgt->template data(), src->template data(), num_rows, row_size); + } else { + THCTensor_kernel_varInnermostDim<<>>( + tgt->template data(), src->template data(), num_rows, row_size); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) THError(cudaGetErrorString(errcode)); +} + + +/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). + These are useful when you not only have to do a reduction, but you might have + to preserve the location of contention (for example min/max operations). + The structure of the kernels follows the structure of the reduction kernels. +*/ +template +__global__ void +kernelTransformReduceOuterDimIndex(K *tgt1, + Index *tgt2, + K *src_, + unsigned num_orows, + unsigned num_irows, + unsigned row_size, + thrust::pair init, + BinaryFunction binary_op) { + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; + irow < num_irows; + irow += gridDim.y * blockDim.x) { + K *src = src_ + orow * row_size * num_irows + irow; + thrust::pair acc = init; + + for (unsigned col = 0; col < row_size; ++col) { + // +1 for Lua index + acc = binary_op(acc, + thrust::make_pair(*src, col + TH_INDEX_BASE)); + src += num_irows; + } + + tgt1[orow * num_irows + irow] = acc.first; + tgt2[orow * num_irows + irow] = acc.second; + } + } +} + +template +__host__ void +THC_transformReduceOuterDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + int64_t rdim, + const thrust::pair& init, + BinaryFunction binary_op) { + unsigned ndim = THCTensor__nDimension(state, src); + unsigned num_orows = 1; + for (int64_t dim = 0; dim < rdim; dim++) { + num_orows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, rdim); + unsigned num_irows = 1; + for (unsigned dim = rdim + 1; dim < ndim; dim++) { + num_irows *= THCTensor_size(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), + min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + kernelTransformReduceOuterDimIndex + <<>>( + tgt1->template data(), + tgt2->template data(), + src->template data(), + num_orows, num_irows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) + * + * For an n-d tensor (n <= 4) where the reduction is along the innermost 
dimension: + * + * - block.x is the innermost dimension, i.e. dimension 0; + * - block.y and grid.y make up dimension 1; and + * - grid.x and grid z are the remaining two outer dimensions (if any) + * + * Reduction along other dimensions is handled in a separate kernel. + */ +template +__global__ void +kernelTransformReduceInnermostDimIndex(K *tgt1, + Index* tgt2, + K *src_, + unsigned num_rows, + unsigned row_size, + thrust::pair init, + BinaryFunction binary_op) { + __shared__ K sbuf[32][16 + 1]; // avoid bank conflict + __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict + + for (unsigned block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + thrust::pair acc = init; + if (row < num_rows) { + K *src = src_ + row * row_size; + // Sequential reduction within a thread. + for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { + acc = binary_op(acc, thrust::make_pair(src[col], col + TH_INDEX_BASE)); + } + } + + sbuf[threadIdx.y][threadIdx.x] = acc.first; + ibuf[threadIdx.y][threadIdx.x] = acc.second; + + __syncthreads(); + + // Reduce intermediate values to single value. + K* sline = &sbuf[threadIdx.y][0]; + Index* iline = &ibuf[threadIdx.y][0]; + for (unsigned s = 8; s > 0; s >>= 1) { + if (row < num_rows && threadIdx.x < s) { + thrust::pair arg1 = + thrust::make_pair(sline[threadIdx.x], iline[threadIdx.x]); + thrust::pair arg2 = + thrust::make_pair(sline[threadIdx.x + s], iline[threadIdx.x + s]); + thrust::pair res = binary_op(arg1, arg2); + + sline[threadIdx.x] = res.first; + iline[threadIdx.x] = res.second; + } + __syncthreads(); + } + + if (row < num_rows && threadIdx.x == 0) { + tgt1[row] = sline[0]; + tgt2[row] = iline[0]; + } + __syncthreads(); + } +} + +template +__host__ void +THC_transformReduceInnermostDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + const thrust::pair& init, + BinaryFunction binary_op) { + unsigned ndim = THCTensor__nDimension(state, src); + unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_size(state, src, dim); + } + unsigned row_size = THCTensor_size(state, src, ndim - 1); + + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + kernelTransformReduceInnermostDimIndex + <<>>( + tgt1->template data(), + tgt2->template data(), + src->template data(), + num_rows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +void +THC_reduceDimIndex(THCState *state, + TensorTypeK *tgt1_, + TensorTypeIndex *tgt2_, + TensorTypeK *src, + int64_t dimension, + int keepdim, + const thrust::pair& init, + BinaryFunction binary_op) +{ + THArgCheck(dimension >= 0 && + dimension < THCTensor__nDimension(state, src), + 3, "dimension out of range"); + + + // Unsqueeze tgt1_/tgt_2 if necessary so that their contiguity traits + // are preserved if they are the same size as the correct reduction output. 
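+  // For example, reducing a 4 x 5 x 6 tensor along dimension 1 is first materialized
+  // as a 4 x 1 x 6 result (see the resize below); only when keepdim is 0 is that
+  // singleton dimension squeezed away at the end, giving a 4 x 6 result.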
+ int src_dims = THCTensor__nDimension(state, src); + THCTensor_preserveReduceDimSemantics( + state, tgt1_, src_dims, dimension, keepdim); + THCTensor_preserveReduceDimSemantics( + state, tgt2_, src_dims, dimension, keepdim); + + THLongStorage *dim = THCTensor_newSizeOf(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_resize(state, tgt1_, dim, NULL); + THCTensor_resize(state, tgt2_, dim, NULL); + THLongStorage_free(dim); + + TensorTypeK *tgt1 = (TensorTypeK*)THCTensor_newContiguous(state, tgt1_); + TensorTypeIndex *tgt2 = (TensorTypeIndex*)THCTensor_newContiguous(state, tgt2_); + src = (TensorTypeK*)THCTensor_newContiguous(state, src); + + if (dimension == THCTensor__nDimension(state, src) - 1) { + THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); + } else { + THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); + } + + THCTensor_free(state, src); + THCTensor_freeCopyTo(state, tgt1, tgt1_); + THCTensor_freeCopyTo(state, tgt2, tgt2_); + if (!keepdim) { + THCTensor_squeeze1d(state, tgt1_, tgt1_, dimension); + THCTensor_squeeze1d(state, tgt2_, tgt2_, dimension); + } +} + +template +struct MaxValuePair { + __host__ __device__ + thrust::pair operator()(const thrust::pair& a, + const thrust::pair& b) { + return (THCNumerics::ge(a.first, b.first) || + THCNumerics::isnan(a.first)) ? a : b; + } +}; + +template +struct MinValuePair { + __host__ __device__ + thrust::pair operator()(const thrust::pair& a, + const thrust::pair& b) { + return (THCNumerics::le(a.first, b.first) || + THCNumerics::isnan(a.first)) ? a : b; + } +}; + +template +struct AddOp { + __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) { + return THCNumerics::add(lhs, rhs); + } +}; + +template +struct MulOp { + __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) { + return THCNumerics::mul(lhs, rhs); + } +}; + +#endif // THC_TENSORMATH_REDUCE_CUH diff --git a/aten/src/THC/THCTensorMathScan.cu b/aten/src/THC/THCTensorMathScan.cu new file mode 100644 index 0000000..6f01bd2 --- /dev/null +++ b/aten/src/THC/THCTensorMathScan.cu @@ -0,0 +1,129 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCBlas.h" +#include "THCTensorCopy.h" +#include "THCApply.cuh" +#include "THCReduce.cuh" +#include "THCNumerics.cuh" +#include "THCTensorMathReduce.cuh" +#include +#include + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. 
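+ *
+ * As a concrete example, an inclusive sum-scan (init = 0, binary_op = add) of the
+ * row [1, 2, 3, 4] produces [1, 3, 6, 10]; with init = 1 and a multiply op it
+ * produces the running products [1, 2, 6, 24]. A serial reference for one inner
+ * row, mirroring the accumulation in the kernel below (strides omitted):
+ *
+ *   T acc = init;
+ *   for (unsigned col = 0; col < row_size; ++col) {
+ *     acc = binary_op(acc, src[col]);
+ *     tgt[col] = acc;
+ *   }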
+ */ +template +__global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_, + unsigned num_orows, unsigned num_irows, unsigned row_size, + T init, BinaryOp binary_op) +{ + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + T *src = src_ + orow * row_size * num_irows + irow; + T *tgt = tgt_ + orow * row_size * num_irows + irow; + T acc = init; + + for (unsigned col = 0; col < row_size; ++col) { + acc = binary_op(acc, *src); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_, + unsigned num_rows, unsigned row_size, + T init, BinaryFunction binary_op) +{ + __shared__ T sbuf[num_threads_y][2 * num_threads_x]; + + T* row_buf = sbuf[threadIdx.y]; + + for (unsigned block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + T block_total = init; + + T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + unsigned col1 = block_col + threadIdx.x; + unsigned col2 = block_col + num_threads_x + threadIdx.x; + if (row < num_rows) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction (up-sweep). + for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) { + if (row < num_rows && threadIdx.x < s) { + unsigned offset = (2 * threadIdx.x + 1) * d - 1; + row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + // Down-sweep. + for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) { + if (row < num_rows && threadIdx.x < s - 1) { + unsigned offset = 2 * (threadIdx.x + 1) * d - 1; + row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + // Write back to output. 
+ if (row < num_rows) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +#include "generic/THCTensorMathScan.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMode.cu b/aten/src/THC/THCTensorMode.cu new file mode 100644 index 0000000..52a5ce2 --- /dev/null +++ b/aten/src/THC/THCTensorMode.cu @@ -0,0 +1,18 @@ +#include "THC.h" +#include "THCThrustAllocator.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "THCTensorMode.cuh" + +#include "generic/THCTensorMode.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh new file mode 100644 index 0000000..0158f25 --- /dev/null +++ b/aten/src/THC/THCTensorMode.cuh @@ -0,0 +1,282 @@ +#ifndef THC_TENSOR_MODE_CUH +#define THC_TENSOR_MODE_CUH + +#include "THCNumerics.cuh" +#include "THCSortUtils.cuh" +#include "THCScanUtils.cuh" + +struct ThrustHalfLess +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::lt(lhs, rhs); + } +}; + +struct ThrustHalfNotEqualTo +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::ne(lhs, rhs); + } +}; + +struct ThrustHalfEqualTo +{ + __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { + return THCNumerics::eq(lhs, rhs); + } +}; + +struct ThrustHalfEqualToPredicate +{ + ThrustHalfEqualToPredicate(half val): val_(val) {} + __host__ __device__ inline bool operator()(half x) { + return THCNumerics::eq(val_, x); + } + + half val_; +}; + +template +struct BinaryAddOp { + __host__ __device__ inline T operator()(const T a, const T b) { + return THCNumerics::add(a, b); + } +}; + +template <> +struct BinaryAddOp { + __host__ __device__ inline unsigned int operator()(const unsigned int a, const unsigned int b) { + return a + b; + } +}; + +// Used for a segmented reduction +struct ModeUnsignedBoolPair { + unsigned int val; + bool flag; +}; + +// In the kernel below, we have a common pattern of reducing (unsigned int, unsigned int) +// pairs of data +struct ModeUnsignedPair { + unsigned int val; + unsigned int index; +}; + +template +struct MaxReduceOp { + __host__ __device__ inline T operator()(const T& a, const T& b) { + return b.val > a.val ? b : a; + } +}; + +template +struct MatchReduceOp { + __host__ __device__ inline T operator()(const T& a, const T& b) { + return b.flag ? b : a; + } +}; + +// The mode kernel has the following characteristics: It uses internal shared memory +// buffers of Power2Size, which must be greater than the number of elements. Additionally, +// there is one block for every slice to calculate the mode for, and in each block there +// is one thread for every two elements. +// +// Both sorted and positions are assumed to be contiguous Tensors with the mode dimension +// as the innermost dim, such that we can get the particular slice for a Tensor via its +// linear block dimension * the slice size. 
+template +__global__ void computeMode( + T *input, + TensorInfo values, + TensorInfo indices, + int64_t sliceSize) +{ + int tidx = threadIdx.x; + int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for + + // First, we need to calculate the offset into the sorted Tensor that represents + // the start of the slice for this block to calculate the mode for. This offset + // is a combination of the gridIndices, and the number of elements in the slice. + unsigned int blockId = getLinearBlockId(); + unsigned int linearOffset = blockId * sliceSize; + + // shmem is a dynamically sized buffer we will use throughout the kernel to + // handle computation efficiently. The size of this shmem must be + // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size) + // + // Initially, the buffer will be organized as follows: + // + // [smem (slice elements) | bmem (valid indices) | ] + extern __shared__ char shmem[]; + + // smem represents a proportion of the shared memory buffer that is used to store + // the elements from the slice: + T *smem = reinterpret_cast(shmem); + + // Each thread loads up to two elements from the Tensor into shared memory + if (tidx < sliceSize) { + smem[tidx] = input[linearOffset + tidx]; + } + if (stidx < sliceSize) { + smem[stidx] = input[linearOffset + stidx]; + } + + // Next, we initialize a boolean region of the buffer, offset by the loaded element + // smem region + bool *bmem = reinterpret_cast(&smem[Power2Size]); + + // The first use of this region stores bmem[i] = i < sliceSize to mark the valid + // components in the smem buffer + bmem[tidx] = tidx < sliceSize; + bmem[stidx] = stidx < sliceSize; + __syncthreads(); // barrier for smem, bmem initialization + + // First, sort the input slice in ascending order. smem contains the input + // elements, and bmem marks the valid indices + bitonicSortKeys, T, unsigned int, Power2Size>(smem, bmem, LTComp()); + __syncthreads(); // make no assumptions that the sort syncs at end + + // The next step of our algorithm is performing a block-wide comparison of + // neighboring elements. In particular, given an sorted input slice A, we + // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise 0. + // + // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8] + // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // + // In particular, we can think of B[i] true indicating the start of a sequence of + // equal values in the sorted list. Similarly, we will also store the negation of B, + // which we'll call C. In particular, we can think of C[i] = true iff A[i-1] == A[i] + // in our original sorted slice. + // + // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + + // We overwrite bmem, and treat the rest of shared memory as a buffer of (index, flag) pairs + // where the index represents values from C, and the flag represents values from B. + // + // [smem (sorted slice) | ubpmem (index, flag pairs)] + + struct ModeUnsignedBoolPair *ubpmem = reinterpret_cast( + &smem[Power2Size]); + + if (tidx == 0) { + ubpmem[0].flag = true; + ubpmem[0].val = 0; + } + + // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ... + ubpmem[tidx * 2 + 1].flag = THCNumerics::ne(smem[tidx * 2], smem[tidx * 2 + 1]); // (0, 1), (1, 2), etc. + ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag; + + // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ... 
+ if (((tidx + 1) * 2) < Power2Size) { + ubpmem[(tidx + 1) * 2].flag = THCNumerics::ne(smem[((tidx + 1) * 2) - 1], smem[(tidx + 1) * 2]); + ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag; + } + __syncthreads(); // barrier for ubpmem initialization + + // Next, we perform a segmented prefix sum on the neighboring elements, where + // the presence of a one indicates the start of a segment. In this case B acts + // as the segment start flags, and C is the buffer to be summed: + // + // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0] + // + // Afterwards, the (index) components of the ubpmem buffer contain the lengths of the + // segments (minus 1), i.e. the counts of each element in the original input. + + inclusivePrefixScan< + struct ModeUnsignedBoolPair, + struct SegmentedScanOp >, + Power2Size>( + ubpmem, + SegmentedScanOp >(BinaryAddOp())); + // assumes scan syncs at the end + + // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. we treat the + // boolean flag regions as integers). We initialize these to represent indices, and we'll call + // this buffer I + struct ModeUnsignedPair *uupmem = reinterpret_cast(ubpmem); + + // At this point, we need to find the maximum element in lengths buffer C. + // This element will represent the count (-1) of the mode. Because of the + // way we have set up the problem, the index where this mode occurs will + // also be the location of the mode value in the sorted array, e.g. + // + // smem = [0, 0, 1, 1, 1, 2] + // C = [0, 1, 0, 1, 2, 0] + // I = [0, 1, 2, 3, 4, 5] + // ^ + // maximum value, also aligned with mode = 1 + // + // We perform a block wide max-reduction of the C buffer, but we also need the + // indices to come along with it, so we utilize the uupmem construction. + // + // At the end we need to return the ModeUnsignedPair containing index = 4, val = 2, + // which represents the max + + // In practice, we will make each thread locally reduce 2 values in its registers prior + // to the global block-wide reduction. Note that instead of tidx/stidx, we utilize tidx * 2, + // tidx * 2 + 1, so each thread deals with adjacent elements. This is because the reduce + // code below relies on thread elements to be adjacent. + struct ModeUnsignedPair uup[2]; + uup[0].index = tidx * 2; + uup[0].val = ubpmem[tidx * 2].val; + uup[1].index = tidx * 2 + 1; + uup[1].val = ubpmem[tidx * 2 + 1].val; + __syncthreads(); + + struct ModeUnsignedPair max = {0, 0}; + + max = reduceBlockWithNThreadLocalReductions, 2> + (uupmem, uup, sliceSize, MaxReduceOp(), max); + + // Store the mode in shared memory for use in finding the mode in the input slice + __shared__ T mode; + + // Given the above constraints, the mode is the value at the reduced index in the + // original sorted element buffer + if (tidx == 0) { + mode = smem[max.index]; + } + __syncthreads(); // broadcast mode + + // Finally, we need to find the "an" index of the mode in the input Tensor. The API does + // not constrain which index we pick, so it can be any of the indices that contain the mode. + // We will do a reduction to find the index. We go back to using the (index, flag) buffer + // arrangement. 
First, we mark indices that are equal to the mode, i.e B[i] = true if + // input[i] == mode, and initialize C[i] to be the index + // + // Again we reduce 2 elements in the thread's registers prior to the block-wide reduction + struct ModeUnsignedBoolPair ubpp[2]; + if (tidx * 2 < sliceSize) { + ubpp[0].flag = THCNumerics::eq(input[linearOffset + (tidx * 2)], mode); + ubpp[0].val = tidx * 2; + } + if (tidx * 2 + 1 < sliceSize) { + ubpp[1].flag = THCNumerics::eq(input[linearOffset + (tidx * 2 + 1)], mode); + ubpp[1].val = tidx * 2 + 1; + } + + // Then we perform a similar reduction to the one above, except this time we update + // the element if the element at the base position is not equal to the mode and + // the element at the offset position is. At the end, C[0] will contain an index + // with the mode. + struct ModeUnsignedBoolPair match = {0, false}; + + match = reduceBlockWithNThreadLocalReductions, 2> + (ubpmem, ubpp, sliceSize, MatchReduceOp(), match); + + // Finally, we have the mode, and an index where it occurs. We use a single thread + // to place this in the appropriate output position + if (tidx == 0) { + int64_t index = TH_INDEX_BASE + match.val; + + unsigned int outputOffset = IndexToOffset::get(blockId, values); + values.data[outputOffset] = mode; + indices.data[outputOffset] = index; + } +} + +#endif // THC_TENSOR_MODE_CUH diff --git a/aten/src/THC/THCTensorRandom.cpp b/aten/src/THC/THCTensorRandom.cpp new file mode 100644 index 0000000..e7a4100 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cpp @@ -0,0 +1,141 @@ +#include "THCTensorRandom.h" +#include "THCGenerator.hpp" + +#include +#include + + +void initializeGenerator(THCState *state, THCGenerator* gen); +void createGeneratorState(THCGenerator* gen, uint64_t seed); + + +/* Frees memory allocated during setup. 
*/ +void destroyGenerator(THCState *state, THCGenerator* gen) +{ + std::lock_guard lock(gen->mutex); + if (gen->state.gen_states) + { + THCudaFree(state, gen->state.gen_states); + gen->state.gen_states = NULL; + } + if (gen->state.kernel_params) + { + THCudaFree(state, gen->state.kernel_params); + gen->state.kernel_params = NULL; + } +} + +static uint64_t createSeed(std::random_device& rd) +{ + // limit to 53 bits to ensure unique representation in double + uint64_t seed = (((uint64_t)rd()) << 32) + rd(); + return seed & 0x1FFFFFFFFFFFFF; +} + +/* Initialize generator array (must be called before any other function) */ +void THCRandom_init(THCState* state, int devices, int current_device) +{ + THCRNGState* rng_state = THCState_getRngState(state); + rng_state->num_devices = devices; + rng_state->gen = (THCGenerator*)malloc(rng_state->num_devices * sizeof(THCGenerator)); + std::random_device rd; + for (int i = 0; i < rng_state->num_devices; ++i) + { + new (&rng_state->gen[i].mutex) std::mutex(); + rng_state->gen[i].state.initf = 0; + rng_state->gen[i].state.initial_seed = createSeed(rd); + rng_state->gen[i].state.philox_seed_offset = 0; + rng_state->gen[i].state.gen_states = NULL; + rng_state->gen[i].state.kernel_params = NULL; + } +} + +/* Destroy generators and free memory */ +void THCRandom_shutdown(THCState* state) +{ + THCRNGState* rng_state = THCState_getRngState(state); + if (rng_state->gen == NULL) return; + for (int i = 0; i < rng_state->num_devices; ++i) + { + destroyGenerator(state, &rng_state->gen[i]); + } + free(rng_state->gen); + rng_state->gen = NULL; +} + +/* Get the generator for the current device, but does not initialize the state */ +static THCGenerator* THCRandom_rawGenerator(THCState* state) +{ + THCRNGState* rng_state = THCState_getRngState(state); + int device; + THCudaCheck(cudaGetDevice(&device)); + if (device >= rng_state->num_devices) THError("Invalid device index."); + return &rng_state->gen[device]; +} + +/* Get the generator for the current device and initializes it if necessary */ +THCGenerator* THCRandom_getGenerator(THCState* state) +{ + THCGenerator* gen = THCRandom_rawGenerator(state); + std::lock_guard lock(gen->mutex); + if (gen->state.initf == 0) + { + initializeGenerator(state, gen); + createGeneratorState(gen, gen->state.initial_seed); + gen->state.initf = 1; + } + return gen; +} + +struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + return gen->state.gen_states; +} + +/* Random seed */ +uint64_t THCRandom_seed(THCState* state) +{ + std::random_device rd; + uint64_t s = createSeed(rd); + THCRandom_manualSeed(state, s); + return s; +} + +uint64_t THCRandom_seedAll(THCState* state) +{ + std::random_device rd; + uint64_t s = createSeed(rd); + THCRandom_manualSeedAll(state, s); + return s; +} + +/* Manually set the seed */ +void THCRandom_manualSeed(THCState* state, uint64_t seed) +{ + THCGenerator* gen = THCRandom_rawGenerator(state); + std::lock_guard lock(gen->mutex); + gen->state.initial_seed = seed; + if (gen->state.initf) { + createGeneratorState(gen, seed); + } +} + +void THCRandom_manualSeedAll(THCState* state, uint64_t seed) +{ + THCRNGState* rng_state = THCState_getRngState(state); + int currentDevice; + THCudaCheck(cudaGetDevice(¤tDevice)); + for (int i = 0; i < rng_state->num_devices; ++i) { + THCudaCheck(cudaSetDevice(i)); + THCRandom_manualSeed(state, seed); + } + THCudaCheck(cudaSetDevice(currentDevice)); +} + +/* Get the initial seed */ +uint64_t 
THCRandom_initialSeed(THCState* state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + return gen->state.initial_seed; +} diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu new file mode 100644 index 0000000..6544a18 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cu @@ -0,0 +1,196 @@ +#include "THCTensorRandom.h" +#include "THCDeviceUtils.cuh" +#include "THCGeneral.h" +#include "THCTensorCopy.h" +#include "THCTensorMath.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorRandom.cuh" +#include "THCGenerator.hpp" + +#include +#include +#include +#include +#include + +#define MAX_NUM_BLOCKS 200 +#define BLOCK_SIZE 256 + + +THCGenerator* THCRandom_getGenerator(THCState* state); + +/* Sets up generator. Allocates but does not create the generator states. Not thread-safe. */ +__host__ void initializeGenerator(THCState *state, THCGenerator* gen) +{ + gen->state.gen_states = static_cast(THCudaMalloc(state, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32))); + gen->state.kernel_params = static_cast(THCudaMalloc(state, sizeof(mtgp32_kernel_params))); +} + +/* Creates a new generator state given the seed. Not thread-safe. */ +__host__ void createGeneratorState(THCGenerator* gen, uint64_t seed) +{ + if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS) + { + THError("Creating MTGP constants failed."); + } + if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213, + gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS) + { + THError("Creating MTGP kernel state failed."); + } +} + +__host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + std::lock_guard lock(gen->mutex); + + // The RNG state comprises the MTPG32 states, the seed, and an offset used for Philox + static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + static const size_t seed_size = sizeof(gen->state.initial_seed); + static const size_t offset_size = sizeof(gen->state.philox_seed_offset); + static const size_t total_size = states_size + seed_size + offset_size; + THByteTensor_resize1d(rng_state, total_size); + THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); + THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); + THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states, + states_size, cudaMemcpyDeviceToHost)); + memcpy(THByteTensor_data(rng_state) + states_size, &gen->state.initial_seed, seed_size); + memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &gen->state.philox_seed_offset, offset_size); +} + +__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel) +{ + state[threadIdx.x].k = kernel; +} + +__host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state) +{ + THCGenerator* gen = THCRandom_getGenerator(state); + std::lock_guard lock(gen->mutex); + + static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + static const size_t seed_size = sizeof(gen->state.initial_seed); + static const size_t offset_size = sizeof(gen->state.philox_seed_offset); + static const size_t total_size = states_size + seed_size + offset_size; + bool no_philox_seed = false; + if (THByteTensor_nElement(rng_state) == total_size - offset_size) { + no_philox_seed = true; + } + else { + THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, 
"RNG state is wrong size"); + } + THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); + + THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state), + states_size, cudaMemcpyHostToDevice)); + set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>( + gen->state.gen_states, gen->state.kernel_params); + memcpy(&gen->state.initial_seed, THByteTensor_data(rng_state) + states_size, seed_size); + if (!no_philox_seed) { + memcpy(&gen->state.philox_seed_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); + } + else { + gen->state.philox_seed_offset = 0; + } +} + +// Goes from (0, 1] to [0, 1). Note 1-x is not sufficient since for some floats +// eps near 0, 1-eps will round to 1. +template +__device__ inline T reverse_bounds(T value) { + if (THCNumerics::eq(value, ScalarConvert::to(1))) { + return ScalarConvert::to(0); + } + return value; +} + + +#ifdef CUDA_HALF_TENSOR +__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { + half width = ScalarConvert::to(b - a); + half start = ScalarConvert::to(a); + half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); +} +#endif + +#define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ +{ \ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \ + CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \ + if (i < size) { \ + T y = TRANSFORM; \ + result[i] = y; \ + } \ + } \ +} + +#define GENERATE_KERNEL2(NAME, T, ARG1, ARG2, CURAND_T, CURAND_FUNC, TRANSFORM) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \ +{ \ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \ + CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \ + if (i < size) { \ + T y = TRANSFORM; \ + result[i] = y; \ + } \ + } \ +} + +template +struct is_same { static const bool value = false; }; + +template +struct is_same { static const bool value = true; }; + +template +__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size, + real *result, prob_type *probs) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + if (is_same::value) { + double x = curand_uniform_double(&state[blockIdx.x]); + if (i < size) + result[i] = ScalarConvert::to(x <= probs[i]); + } else { + float x = curand_uniform(&state[blockIdx.x]); + if (i < size) + result[i] = ScalarConvert::to(x <= probs[i]); + } + } +} + +// NOTE: curand_uniform is (0, 1] and we want [a, b) +GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) + +GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) +GENERATE_KERNEL2(generate_normal, double, double mean, 
double stdv, double, curand_normal_double, (x * stdv) + mean) + +GENERATE_KERNEL1(generate_exponential, float, double lambda, float, curand_uniform, (float)(-1. / lambda * log(x))) +GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uniform_double, (double)(-1. / lambda * log(x))) + +GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) +GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) + +#ifdef CUDA_HALF_TENSOR +GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +#endif // CUDA_HALF_TENSOR + +#include "generic/THCTensorRandom.cu" +#include "THCGenerateAllTypes.h" + +#undef GENERATE_KERNEL1 +#undef GENERATE_KERNEL2 diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh new file mode 100644 index 0000000..7749f23 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.cuh @@ -0,0 +1,401 @@ +#ifndef THC_TENSOR_RANDOM_CUH +#define THC_TENSOR_RANDOM_CUH + +#include "THCNumerics.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorMathReduce.cuh" + +#include + +#define MAX_NUM_BLOCKS 200 +#define BLOCK_SIZE 256 +/* Separate kernel because curand_log_normal gets extra parameters. 
*/ + +template +__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + float x = curand_log_normal(&state[blockIdx.x], mean, stddev); + if (i < size) { + result[i] = ScalarConvert::to(x); + } + } +} + +template <> +__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev) +{ + int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; + for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { + double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev); + if (i < size) { + result[i] = x; + } + } +} + +template +__global__ void +multinomialAliasDrawKernel(int size, int64_t *output, int64_t *J, T *q, int64_t K, T *uniform, T *bernoulli){ + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + if (idx < size) { + int64_t rand_ind = ScalarConvert::to(uniform[idx]); + T bern_uniform = bernoulli[idx]; + int _mask = (int) THCNumerics::lt(bern_uniform, q[rand_ind]); + output[idx] = J[rand_ind]*(1 -_mask) + (rand_ind+1L) * _mask; + } +} + +template +__global__ void +aliasMultinomialFilter(T *q, T *probs, int64_t *smaller, int64_t *larger, int64_t *J_data, int64_t *larger_short_data, int64_t *smaller_short_data, T one, int64_t inputsize){ + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + if (idx < inputsize) { + larger_short_data[idx] = 0; + smaller_short_data[idx] = 0; + J_data[idx]= 0; + T val = THCNumerics::mul(probs[idx], ScalarConvert::to(inputsize)); + if (THCNumerics::lt(val, one)) { + smaller[idx] = idx+1; + larger[idx] = 0; + } else { + larger[idx] = idx+1; + smaller[idx] = 0; + } + q[idx] = val; + } +} + +template +__global__ void +condDiv(T *q, int64_t *J, int64_t inputsize, T q_max) { + int64_t idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + T one = ScalarConvert::to(1); + if (idx < inputsize) { + if (J[idx] <= 0) { + q[idx] = one; + } else { + if (THCNumerics::gt(q_max, one)) { + q[idx] = THCNumerics::div(q[idx], q_max); + } + } + } +} + + +#undef MAX_NUM_BLOCKS +#undef BLOCK_SIZE + +// Normalizes the L1 norm of every row to 1; used by multinomial +template +__global__ void renormRowsL1(T* dist, long rows, long cols) { + extern __shared__ unsigned char my_smem[]; + T *smem = reinterpret_cast(my_smem); + T zero = ScalarConvert::to(0); + T val; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + T sum = ScalarConvert::to(0); + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + val = dist[row * cols + col]; + assert(THCNumerics::ge(val, zero)); + sum = THCNumerics::add(sum, val); + } + + sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd(), zero); + if (threadIdx.x == 0) { + assert(THCNumerics::gt(sum, zero)); + smem[0] = sum; + } + __syncthreads(); + + sum = smem[0]; + if (THCNumerics::gt(sum, ScalarConvert::to(0))) { + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + dist[row * cols + col] = THCNumerics::div(dist[row * cols + col], sum); + } + } + } +} + +template +__device__ int binarySearchForMultinomial(T* dist, + int size, + T val) { + int start = 0; + int end = size; + + while (end - start > 0) { + int mid = start + (end - start) / 2; + + T midVal = dist[mid]; + if (THCNumerics::lt(midVal, val)) { + start = mid + 1; + } else { + end = mid; + } + } + + 
if (start == size) { + // No probability mass or precision problems; just return the + // first non-zero element by setting start to size-1 here, + // the code below will move it to the last non-zero probability + // this actually can happen when the random number is 1 + // (github pytorch issue #4858). + start = size - 1; + } + + T curVal = dist[start]; + while(start >= 1 && THCNumerics::eq(dist[start - 1], curVal)) start--; + + return start; +} + +template +__global__ void +sampleMultinomialOnce(int64_t* dest, + int64_t distributions, + int categories, + T* sampled, + T* dist, + int stride_dist, // dist->stride[0] + int stride_categories // dist->stride[1] + ) { + extern __shared__ unsigned char my_smem[]; + __shared__ bool found; + + // Shared Memory hold blockdim.x T for holding the cumulative sum, + // blockDim.x AccT for normalizing the probabilities, + T *smem = reinterpret_cast(my_smem); + AccT *asmem = reinterpret_cast(&my_smem[blockDim.x * sizeof(T)]); + + AccT accZero = ScalarConvert::to(0); + T zero = ScalarConvert::to(0); + + for (int64_t curDist = blockIdx.x; + curDist < distributions; curDist += gridDim.x) { + // Each block handles one distribution + // First pass, find the total sum of the distribution + AccT sum = accZero; + T val; + for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { + val = dist[curDist * stride_dist + cat * stride_categories]; + assert(THCNumerics::ge(val, zero)); + assert(!THCNumerics::isinf(val)); + assert(!THCNumerics::isnan(val)); + sum = THCNumerics::add(sum, ScalarConvert::to(val)); + } + + // threadIdx.x == 0 has the sum value from this + sum = reduceBlock(asmem, blockDim.x, sum, ReduceAdd(), accZero); + + // Broadcast sum and sample value + if (threadIdx.x == 0) { + // Make sure the sum of our distribution didn't overflow + assert(!isinf(sum)); + assert(THCNumerics::gt(sum, accZero)); + + asmem[0] = sum; + smem[0] = sampled[curDist]; + } + __syncthreads(); + + sum = asmem[0]; + T sample = smem[0]; + __syncthreads(); + + if (THCNumerics::eq(sum, accZero) || THCNumerics::eq(sample, zero)) { + // Choose the first element + if (threadIdx.x == 0) { + dest[curDist] = TH_INDEX_BASE; + } + + continue; + } + + int chunks = THCCeilDiv(categories, (int) blockDim.x); + T prevHighProb = zero; + found = false; + + for (int chunk = 0; chunk < chunks && !found; ++chunk) { + // All threads in bounds load a value + int cat = chunk * blockDim.x + threadIdx.x; + + AccT val = + cat < categories ? + THCNumerics::div( + ScalarConvert::to(dist[curDist * stride_dist + cat * stride_categories]), + sum) : + accZero; + + smem[threadIdx.x] = ScalarConvert::to(val); + __syncthreads(); + + // Perform an inclusive prefix sum of the shared memory contents + for (int offset = 1; offset < blockDim.x; offset *= 2) { + T val = zero; + + if (threadIdx.x >= offset) { + val = THCNumerics::add(smem[threadIdx.x - offset], smem[threadIdx.x]); + } + + __syncthreads(); + if (threadIdx.x >= offset) { + smem[threadIdx.x] = val; + } + __syncthreads(); + } + + // Each thread will check to see if the sample falls in its + // bucket + T curBucket = THCNumerics::add(smem[threadIdx.x], prevHighProb); + T prevBucket = + threadIdx.x == 0 ? 
prevHighProb : + THCNumerics::add(smem[threadIdx.x - 1], prevHighProb); + bool inBucket = + (cat < categories) && + (!THCNumerics::gt(sample, curBucket)) && + (THCNumerics::gt(sample, prevBucket)); + + if (inBucket) { + // We're done; we have the sample + // Torch indices are 1-based + dest[curDist] = cat + TH_INDEX_BASE; + found = true; + } + + // Store the previous scan's high value for future use + prevHighProb = THCNumerics::add(prevHighProb, smem[blockDim.x - 1]); + + __syncthreads(); + } + + if (threadIdx.x == 0 && !found) { + // This should address a rare bug where we don't select a valid index. This likely occurs when + // due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but + // and our uniform sample is greater than this value. In this case we likely have unitialized memory + // in dest[curDist]. So basically we will loop through the distribution and pick the largest index + // where the distribution is non-zero. This is obviously terribly inefficient, but due to the + // rarity in which this occurs, this should not be an issue. + for (int cat = categories - 1; cat >= 0; --cat) { + if (THCNumerics::gt(dist[curDist * stride_dist + cat * stride_categories], zero)) { + dest[curDist] = cat + TH_INDEX_BASE; + break; + } + } + } + } +} + +template +__global__ void +sampleMultinomialWithReplacement(curandStateMtgp32* state, + int totalSamples, + int64_t* dest, + int64_t distributions, + int categories, + T* normDistPrefixSum) { + // At the moment, each warp computes one sample value in the binary + // search due to divergence. It seems possible to compute multiple + // values and limit divergence though later on. However, no matter + // what, all block threads must participate in the curand_uniform + // call to update the generator state. + + // The block determines the distribution for which we generate a point + for (int64_t curDist = blockIdx.x; + curDist < distributions; + curDist += gridDim.x) { + for (int sampleBase = 0; + sampleBase < totalSamples; sampleBase += blockDim.y) { + // The warp determines the sample + int sample = sampleBase + threadIdx.y; + + // All threads participate in this + T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x])); + + if (threadIdx.x == 0 && sample < totalSamples) { + // Find the bucket that a uniform sample lies in + int choice = binarySearchForMultinomial( + normDistPrefixSum + curDist * categories, + categories, + r); + + // Torch indices are 1-based + dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE; + } + } + } +} + +template +__global__ void +sampleMultinomialWithoutReplacement(curandStateMtgp32* state, + int totalSamples, + int sample, + int64_t* dest, + int64_t distributions, + int categories, + T* origDist, + T* normDistPrefixSum) { + // At the moment, each warp computes one sample value in the binary + // search due to divergence. It seems possible to compute multiple + // values and limit divergence though later on. However, no matter + // what, all block threads must participate in the curand_uniform + // call to update the generator state. 
+ + // The block and warp determines the distribution for which we + // generate a point + for (int64_t curDistBase = blockIdx.x * blockDim.y; + curDistBase < distributions; + curDistBase += gridDim.x * blockDim.y) { + // The warp determines the distribution + int64_t curDist = curDistBase + threadIdx.y; + + // All threads must participate in this + T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x])); + + if (threadIdx.x == 0 && curDist < distributions) { + // Find the bucket that a uniform sample lies in + int choice = binarySearchForMultinomial( + normDistPrefixSum + curDist * categories, + categories, + r); + + // Torch indices are 1-based + dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE; + + // Without replacement, so update the original probability so it + // is not considered a second time + origDist[curDist * categories + choice] = ScalarConvert::to(0); + } + } +} + +template +__global__ void +aliasMultinomialSetup(int64_t *J, T*q, int64_t inputsize, int64_t * smaller, int64_t *larger, int small_c, int large_c) { + T one = ScalarConvert::to(1); + // Loop through and create little binary mixtures that + // appropriately allocate the larger outcomes over the + // overall uniform mixture. + int64_t large = 0; + int64_t small = 0; + while (small_c > 0 && large_c > 0) { + large = larger[large_c-1]-1; + small = smaller[small_c-1]-1; + J[small] = large; + T q_sub = THCNumerics::sub(one, q[small]); + q[large] = THCNumerics::sub(q[large], q_sub); + if (THCNumerics::le(q[large], one)) { + smaller[small_c-1] = large+1; + large_c -= 1; + } else { + larger[large_c-1] = large+1; + small_c -= 1; + } + } +} + +#endif // THC_TENSOR_RANDOM_CUH diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h new file mode 100644 index 0000000..5203df2 --- /dev/null +++ b/aten/src/THC/THCTensorRandom.h @@ -0,0 +1,31 @@ +#ifndef TH_CUDA_TENSOR_RANDOM_INC +#define TH_CUDA_TENSOR_RANDOM_INC + +#include "THCTensor.h" + +#include "generic/THCTensorRandom.h" +#include "THCGenerateAllTypes.h" + +typedef struct THCGenerator THCGenerator; + +typedef struct THCRNGState { + /* One generator per GPU */ + THCGenerator* gen; + int num_devices; +} THCRNGState; + +struct THCState; + +THC_API void THCRandom_init(struct THCState *state, int num_devices, int current_device); +THC_API void THCRandom_shutdown(struct THCState *state); +THC_API uint64_t THCRandom_seed(struct THCState *state); +THC_API uint64_t THCRandom_seedAll(struct THCState *state); +THC_API void THCRandom_manualSeed(struct THCState *state, uint64_t the_seed_); +THC_API void THCRandom_manualSeedAll(struct THCState *state, uint64_t the_seed_); +THC_API uint64_t THCRandom_initialSeed(struct THCState *state); +THC_API void THCRandom_getRNGState(struct THCState *state, THByteTensor *rng_state); +THC_API void THCRandom_setRNGState(struct THCState *state, THByteTensor *rng_state); + +THC_API struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state); + +#endif diff --git a/aten/src/THC/THCTensorScatterGather.cu b/aten/src/THC/THCTensorScatterGather.cu new file mode 100644 index 0000000..a1ed0d4 --- /dev/null +++ b/aten/src/THC/THCTensorScatterGather.cu @@ -0,0 +1,184 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCAtomics.cuh" +#include "THCApply.cuh" + +// Compute the offsets into the given tensors for a linear index. For the 't2' +// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same +// size (with the exception of 't2' in dimension 'dim'). 
+// This version uses a static number of dimensions. +template +struct IndexToScatterGatherOffsets { + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t1, IndexType* t1Offset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = Dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + *t1Offset += curDimIndex * t1.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } + + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = Dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } +}; + +// Same as above but using a dynamic number of dimensions. +template +struct IndexToScatterGatherOffsets { + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t1, IndexType* t1Offset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = index.dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + *t1Offset += curDimIndex * t1.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } + + static __device__ void compute( + IndexType linearId, const int dim, + const TensorInfo& index, IndexType* indexOffset, + const TensorInfo& t2, IndexType* t2Offset) { + for (int d = index.dims - 1; d >= 0; d--) { + IndexType curDimIndex = linearId % index.sizes[d]; + *indexOffset += curDimIndex * index.strides[d]; + if (d != dim) { + *t2Offset += curDimIndex * t2.strides[d]; + } + linearId /= index.sizes[d]; + } + } +}; + +template +__global__ void THCudaTensor_gatherKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + tensor, &tensorOffset, + src, &srcOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < src.sizes[dim]); + srcOffset += indexValue * src.strides[dim]; + + tensor.data[tensorOffset] = src.data[srcOffset]; + } +} + +template +__global__ void THCudaTensor_scatterKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + src, &srcOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + tensor.data[tensorOffset] = 
src.data[srcOffset]; + } +} + +template +__global__ void THCudaTensor_scatterAddKernel( + TensorInfo tensor, + TensorInfo src, + TensorInfo index, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType srcOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + src, &srcOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + atomicAdd(&tensor.data[tensorOffset], src.data[srcOffset]); + } +} + +template +__global__ void THCudaTensor_scatterFillKernel( + TensorInfo tensor, + TensorInfo index, + Real value, + const int dim, + const IndexType totalElements) { + for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; + linearId < totalElements; + linearId += gridDim.x * blockDim.x) { + IndexType tensorOffset = 0; + IndexType indexOffset = 0; + + IndexToScatterGatherOffsets::compute(linearId, dim, + index, &indexOffset, + tensor, &tensorOffset); + + int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE; + assert(indexValue >= 0 && indexValue < tensor.sizes[dim]); + tensorOffset += indexValue * tensor.strides[dim]; + + tensor.data[tensorOffset] = value; + } +} + +#include "generic/THCTensorScatterGather.cu" +#include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorSort.cu b/aten/src/THC/THCTensorSort.cu new file mode 100644 index 0000000..ed1342f --- /dev/null +++ b/aten/src/THC/THCTensorSort.cu @@ -0,0 +1,62 @@ +#include "THCTensorSort.cuh" + +void THCudaLongTensor_fillSliceWithIndex(THCState* state, + THCudaLongTensor* t, + int dim) { + int64_t dims = THCudaLongTensor__nDimension(state, t); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + ptrdiff_t inElements = THCudaLongTensor_nElement(state, t); + int64_t sliceSize = THCudaLongTensor_size(state, t, dim); + ptrdiff_t numSlices = inElements / sliceSize; + + dim3 grid; + if (!THC_getGridFromTiles(numSlices, grid)) { + THError("Slice to fill with indices is too large"); + } + + int64_t maxThreads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + int64_t numThreads = sliceSize; + if (numThreads > maxThreads) { + numThreads = maxThreads; + } + + dim3 block(numThreads); + +#define FILL_INDEX(T, DIM) \ + fillSliceWithIndex \ + <<>>( \ + info, numSlices, sliceSize, info.strides[collapseDim]) + + if (THCTensor_canUse32BitIndexMath(state, t)) { + TensorInfo info = + getTensorInfo(state, t); + info.reduceDim(dim); + int collapseDim = info.collapseDims(dim); + + if (info.isContiguous()) { + FILL_INDEX(unsigned int, -2); + } else { + if (info.dims == 1) { + FILL_INDEX(unsigned int, 1); + } else if (info.dims == 2) { + FILL_INDEX(unsigned int, 2); + } else { + FILL_INDEX(unsigned int, -1); + } + } + } else { + TensorInfo info = + getTensorInfo(state, t); + info.reduceDim(dim); + int collapseDim = info.collapseDims(dim); + + // catch-all implementation + FILL_INDEX(uint64_t, -1); + } + +#undef FILL_INDEX + + THCudaCheck(cudaGetLastError()); +} diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh new file mode 100644 index 0000000..9b75a73 --- /dev/null +++ b/aten/src/THC/THCTensorSort.cuh @@ -0,0 +1,86 @@ +#ifndef THC_TENSORSORT_CUH +#define THC_TENSORSORT_CUH + +#include 
"THCReduceApplyUtils.cuh" +#include "THCSortUtils.cuh" +#include "THCTensorCopy.h" +#include "THCTensorTypeUtils.cuh" + +#include "THCThrustAllocator.cuh" +#include +#include +#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__) +#include +#endif + +template +struct ThrustGTOp { + __device__ bool operator()(const T& lhs, const T& rhs) const { + return THCNumerics::gt(lhs, rhs); + } +}; + +template +struct ThrustLTOp { + __device__ bool operator()(const T& lhs, const T& rhs) const { + return THCNumerics::lt(lhs, rhs); + } +}; + +// `base` is the base address of a tensor +// For each slice (defined as a linear point of `out`, from 0 -> +// (sliceSize - 1) * sliceStride, we fill that slice from `0` to +// `sliceSize - 1`. +template +__global__ void +fillSliceWithIndex(TensorInfo out, + IndexType totalSlices, + IndexType sliceSize, + IndexType sliceStride) { + IndexType slice = getLinearBlockId(); + + if (slice >= totalSlices) { + return; + } + + const uint64_t offset = + IndexToOffset::get(slice, out); + int64_t* base = &out.data[offset]; + + for (int64_t i = threadIdx.x; i < sliceSize; i += blockDim.x) { + // Torch indices are 1-based (hence the +1) + base[i * sliceStride] = i + TH_INDEX_BASE; + } +} + +// For slice sorting in Thrust; extracts a slice index from a linear +// index and uses that for comparison +struct SliceComp { + SliceComp(int64_t size) : sliceSize(size) {} + + __device__ bool operator()(const int64_t& a, const int64_t& b) const { + // Since the slices are guaranteed to be innermost, + // the segment is just via int64_t division + int64_t segA = a / sliceSize; + int64_t segB = b / sliceSize; + return segA < segB; + } + + const int64_t sliceSize; +}; + +// For sorting in Thurst; extracts a within-slice index from a linear index +struct GlobalIndexToPerSliceIndex { + GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {} + + __device__ inline void operator()(int64_t& v) const { + v = v % sliceSize + TH_INDEX_BASE; + } + + const int64_t sliceSize; +}; + +void THCudaLongTensor_fillSliceWithIndex(THCState* state, + THCudaLongTensor* t, + int dim); +#endif // THC_TENSORSORT_CUH diff --git a/aten/src/THC/THCTensorTopK.cu b/aten/src/THC/THCTensorTopK.cu new file mode 100644 index 0000000..325d560 --- /dev/null +++ b/aten/src/THC/THCTensorTopK.cu @@ -0,0 +1,19 @@ +#include "THC.h" +#include "THCReduceApplyUtils.cuh" +#include "THCTensorCopy.h" +#include "THCTensorMath.h" +#include "THCAsmUtils.cuh" +#include "THCScanUtils.cuh" +#include "THCTensorTypeUtils.cuh" +#include "THCTensorMathReduce.cuh" +#include // for std::min + +#if CUDA_VERSION >= 7000 +#include +#endif + +#include "THCTensorTopK.cuh" + +#include "generic/THCTensorTopK.cu" +#include "THCGenerateAllTypes.h" + diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh new file mode 100644 index 0000000..c243316 --- /dev/null +++ b/aten/src/THC/THCTensorTopK.cuh @@ -0,0 +1,485 @@ +#ifndef THC_TENSOR_TOPK_CUH +#define THC_TENSOR_TOPK_CUH + +template +struct TopKTypeConfig {}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + // Converts a float to an integer representation with the same + // sorting; i.e., for floats f1, f2: + // if f1 < f2 then convert(f1) < convert(f2) + // We use this to enable radix selection of floating-point values. + // This also gives a relative order for NaNs, but that's ok, as they + // will all be adjacent + static inline __device__ RadixType convert(float v) { + RadixType x = __float_as_int(v); + RadixType mask = (x & 0x80000000) ? 
0xffffffff : 0x80000000; + + return (x ^ mask); + } + + static inline __device__ float deconvert(RadixType v) { + RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff; + + return __int_as_float(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(uint8_t v) { + return v; + } + + static inline __device__ uint8_t deconvert(RadixType v) { + return v; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int8_t v) { + return 128u + v; + } + + static inline __device__ int8_t deconvert(RadixType v) { + return v - 128; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int16_t v) { + assert(sizeof(short) == 2); + return 32768u + v; + } + + static inline __device__ int16_t deconvert(RadixType v) { + return v - 32768; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int32_t v) { + assert(sizeof(int) == 4); + return 2147483648u + v; + } + + static inline __device__ int32_t deconvert(RadixType v) { + return v - 2147483648u; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(int64_t v) { + assert(sizeof(int64_t) == 8); + return 9223372036854775808ull + v; + } + + static inline __device__ int64_t deconvert(RadixType v) { + return v - 9223372036854775808ull; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(double v) { + RadixType x = __double_as_longlong(v); + RadixType mask = -((x >> 63)) | 0x8000000000000000; + return (x ^ mask); + } + + static inline __device__ double deconvert(RadixType v) { + RadixType mask = ((v >> 63) - 1) | 0x8000000000000000; + return __longlong_as_double(v ^ mask); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(half v) { +#if CUDA_VERSION >= 8000 + RadixType x = __half_as_ushort(v); + RadixType mask = -((x >> 15)) | 0x8000; + return (x ^ mask); +#else + assert(false); + return 0u; +#endif + } + + static inline __device__ half deconvert(RadixType v) { +#if CUDA_VERSION >= 8000 + RadixType mask = ((v >> 15) - 1) | 0x8000; + return __ushort_as_half(v ^ mask); +#else + assert(false); + return ScalarConvert::to(0); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +// This function counts the distribution of all input values in a +// slice we are selecting by radix digit at `radixDigitPos`, but only +// those that pass the filter `((v & desiredMask) == desired)`. +// This produces and broadcasts the seen counts for a single block only. +// `smem` must have at least `RadixSize` elements. +template +__device__ void countRadixUsingMask(CountType counts[RadixSize], + CountType* smem, + BitDataType desired, + BitDataType desiredMask, + int radixDigitPos, + IndexType sliceSize, + IndexType withinSliceStride, + DataType* data) { + // Clear out per-thread counts from a previous round +#pragma unroll + for (int i = 0; i < RadixSize; ++i) { + counts[i] = 0; + } + + if (threadIdx.x < RadixSize) { + smem[threadIdx.x] = 0; + } + __syncthreads(); + + // Scan over all the data. Upon a read, the warp will accumulate + // counts per each digit in the radix using warp voting. 
+ for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) { + BitDataType val = TopKTypeConfig::convert(doLdg(&data[i * withinSliceStride])); + + bool hasVal = ((val & desiredMask) == desired); + BitDataType digitInRadix = Bitfield::getBitfield(val, radixDigitPos, RadixBits); + +#pragma unroll + for (unsigned int j = 0; j < RadixSize; ++j) { + bool vote = hasVal && (digitInRadix == j); + counts[j] += __popc(WARP_BALLOT(vote, ACTIVE_MASK())); + } + } + + // Now, for each warp, sum values + if (getLaneId() == 0) { +#pragma unroll + for (unsigned int i = 0; i < RadixSize; ++i) { + atomicAdd(&smem[i], counts[i]); + } + } + + __syncthreads(); + + // For each thread, read in the total counts +#pragma unroll + for (unsigned int i = 0; i < RadixSize; ++i) { + counts[i] = smem[i]; + } + + __syncthreads(); +} + +// Over what radix we are selecting values +#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS) +#define RADIX_SIZE 4 // 2 ^ RADIX_BITS +#define RADIX_MASK (RADIX_SIZE - 1) + +// This finds the unique value `v` that matches the pattern +// ((v & desired) == desiredMask) in our sorted int format +template +__device__ DataType findPattern(DataType* smem, + DataType* data, + IndexType sliceSize, + IndexType withinSliceStride, + BitDataType desired, + BitDataType desiredMask) { + if (threadIdx.x < 32) { + smem[threadIdx.x] = ScalarConvert::to(0); + } + __syncthreads(); + + // All threads participate in the loop, in order to sync on the flag + IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x); + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < sliceSize); + DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : ScalarConvert::to(0); + + if (inRange && ((TopKTypeConfig::convert(v) & desiredMask) == desired)) { + // There should not be conflicts if we are using findPattern, + // since the result is unique + smem[0] = ScalarConvert::to(1); + smem[1] = v; // can't use val as the flag, since it could be 0 + } + + __syncthreads(); + + DataType found = smem[0]; + DataType val = smem[1]; + + __syncthreads(); + + // Check to see if a thread found the value + if (THCNumerics::ne(found, ScalarConvert::to(0))) { + // all threads return this value + return val; + } + } + + // should not get here + assert(false); + return ScalarConvert::to(0); +} + +// Returns the top-Kth element found in the data using radix selection +template +__device__ void radixSelect(DataType* data, + IndexType k, + IndexType sliceSize, + IndexType withinSliceStride, + int* smem, + DataType* topK) { + // Per-thread buckets into which we accumulate digit counts in our + // radix + int counts[RADIX_SIZE]; + + // We only consider elements x such that (x & desiredMask) == desired + // Initially, we consider all elements of the array, so the above + // statement is true regardless of input. 
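// Worked illustration (added here for exposition; not in the original code):
// selecting the 2nd-largest of {0101, 0001, 0111, 0011} with RADIX_BITS = 2.
// First pass, over the high digit: the counts are {00: 2, 01: 2, 10: 0, 11: 0}.
// Scanning digits downward, 01 is the first digit whose count (2) >= kToFind
// (2), so the high digit is locked to 01 and kToFind stays 2. Second pass,
// over the low digit and now filtered to {0101, 0111}: digit 11 has count 1 <
// kToFind, so kToFind drops to 1; digit 01 then has count 1 == kToFind == 1,
// which pins down the unique pattern 0101, and findPattern retrieves that
// element from the data.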
+ BitDataType desired = 0; + BitDataType desiredMask = 0; + + // We are looking for the top kToFind-th element when iterating over + // digits; this count gets reduced by elimination when counting + // successive digits + int kToFind = k; + + // We start at the most significant digit in our radix, scanning + // through to the least significant digit +#pragma unroll + for (int digitPos = sizeof(DataType) * 8 - RADIX_BITS; + digitPos >= 0; + digitPos -= RADIX_BITS) { + + // Count radix distribution for the current position and reduce + // across all threads + countRadixUsingMask( + counts, smem, + desired, desiredMask, digitPos, + sliceSize, withinSliceStride, data); + + // All threads participate in the comparisons below to know the + // final result + + +#define CHECK_RADIX(i) \ + int count = counts[i]; \ + \ + /* All threads have the same value in counts here, so all */ \ + /* threads will return from the function. */ \ + if (count == 1 && kToFind == 1) { \ + /* There is a unique answer. */ \ + desired = Bitfield::setBitfield(desired, i, digitPos, RADIX_BITS); \ + desiredMask = \ + Bitfield::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \ + \ + /* The answer is now the unique element v such that: */ \ + /* (v & desiredMask) == desired */ \ + /* However, we do not yet know what the actual element is. We */ \ + /* need to perform a search through the data to find the */ \ + /* element that matches this pattern. */ \ + *topK = findPattern( \ + (DataType*) smem, data, sliceSize, \ + withinSliceStride, desired, desiredMask); \ + return; \ + } \ + \ + if (count >= kToFind) { \ + desired = Bitfield::setBitfield(desired, i, digitPos, RADIX_BITS); \ + desiredMask = \ + Bitfield::setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS); \ + \ + /* The top-Kth element v must now be one such that: */ \ + /* (v & desiredMask == desired) */ \ + /* but we haven't narrowed it down; we must check the next */ \ + /* least-significant digit */ \ + break; \ + } \ + \ + kToFind -= count \ + + if (Order) { + // Process in descending order +#pragma unroll + for (int i = RADIX_SIZE - 1; i >= 0; --i) { + CHECK_RADIX(i); + } + } else { + // Process in ascending order +#pragma unroll + for (int i = 0; i < RADIX_SIZE; ++i) { + CHECK_RADIX(i); + } + } +#undef CHECK_RADIX + } // end digitPos for + + // There is no unique result, but there is a non-unique result + // matching `desired` exactly + *topK = TopKTypeConfig::deconvert(desired); +} + +template +__global__ void gatherTopK(TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + + IndexType numInputSlices, + IndexType inputWithinSliceStride, + + TensorInfo topK, + IndexType numTopKSlices, + IndexType topKWithinSliceStride, + + TensorInfo indices, + IndexType indicesWithinSliceStride) { + // Indices are limited to integer fp precision, so counts can fit in + // int32, regardless of IndexType + __shared__ int smem[32]; // one per each warp, up to warp limit + + IndexType slice = getLinearBlockId(); + if (slice >= numInputSlices) { + return; + } + + // Find the start offset for our slice + IndexType sliceStartIndex = + IndexToOffset::get(slice, input); + IndexType topKSliceStartIndex = + IndexToOffset::get(slice, topK); + IndexType indicesSliceStartIndex = + IndexToOffset::get(slice, indices); + + T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our 
input + T topKValue = ScalarConvert::to(0); + radixSelect::RadixType, IndexType, Order>( + inputSliceStart, outputSliceSize, + inputSliceSize, inputWithinSliceStride, + smem, &topKValue); + + // Every value that is strictly less/greater than `pattern` + // (depending on sort dir) in sorted int format is in the top-K. + // The top-K value itself might not be unique. + // + // Since there are a variable number of elements that we see that + // are within the top-k, we don't know at what index to write out + // the resulting values. + // In order to get this, we perform an exclusive prefix sum of + // `hasTopK`. This will return the resulting index into which we + // need to write the result, if a thread has a result. + + // All threads need to participate in the loop and the prefix sum, + // but not necessarily in the load; hence loop bounds being rounded + // up to a multiple of the block dim. + IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x); + IndexType writeIndexStart = 0; + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert::to(0); + bool hasTopK; + if (Order) { + hasTopK = inRange && (THCNumerics::gt(v, topKValue)); + } else { + hasTopK = inRange && (THCNumerics::lt(v, topKValue)); + } + + int index; + int carry; + exclusiveBinaryPrefixScan(smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK) { + int writeIndex = writeIndexStart + index; + assert(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index + } + + writeIndexStart += carry; + } + + // We need to fill in the rest with actual == top-K values. + // The number that we need is outputSliceSize - + // writeIndexStart. There might be more than that number available, + // in which case we have to choose the first seen set. We do this + // via a prefix sum to calculate indices for writing results. + assert(outputSliceSize >= writeIndexStart); + IndexType topKRemaining = (outputSliceSize - writeIndexStart); + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? 
doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert::to(0); + bool hasTopK = inRange && (THCNumerics::eq(v, topKValue)); + + int index; + int carry; + exclusiveBinaryPrefixScan(smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK && index < topKRemaining) { + int writeIndex = writeIndexStart + index; + assert(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index + } + + if (carry >= topKRemaining) { + break; + } + + topKRemaining -= carry; + writeIndexStart += carry; + } +} + +#undef RADIX_BITS +#undef RADIX_SIZE +#undef RADIX_MASK + +#endif // THC_TENSOR_TOPK_CUH diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh new file mode 100644 index 0000000..6ff6d68 --- /dev/null +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -0,0 +1,142 @@ +#ifndef THC_TENSOR_TYPE_UTILS_INC +#define THC_TENSOR_TYPE_UTILS_INC + +#include +#include +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensor.hpp" +#include "THCTensorInfo.cuh" +#include "THCTensor.hpp" + +/// A utility for accessing THCuda*Tensor types in a generic manner + +/// Equivalent to C++11's type_traits std::is_same; used for comparing +/// equality of types. Don't assume the existence of C++11 +template +struct SameType { + static const bool same = false; +}; + +template +struct SameType { + static const bool same = true; +}; + +template +bool isSameType() { + return SameType::same; +} + +// Utility function for constructing TensorInfo structs. In this case, the +// two template parameters are: +// +// 1. The TensorType, e.g. THCTensor in generic functions, or THCudaTensor, +// THCudaLongTensor etc. +// +// 2. The IndexType. This is always going to be an unsigned integral value, +// but depending on the size of the Tensor you may select uint16_t +// uint32_t, uint64_t etc. +// +// Internally we use the TensorUtils static functions to get the necessary +// dims, sizes, stride etc. +// +// For example, suppose we have a THCudaTensor t, with dim = 2, size = [3, 4], +// stride = [4, 1], offset = 8, and we set our index type to be unsigned int. +// Then we yield a TensorInfo struct templatized with float, unsigned int and +// the following fields: +// +// data is a float* to the underlying storage at position 8 +// dims is 2 +// sizes is a MAX_CUTORCH_DIMS element array with [3, 4] in its first two positions +// strides is a MAX_CUTORCH_DIMS element array with [4, 1] in its first two positions +// +// TensorInfos can then be passed to CUDA kernels, but we can use the static functions +// defined above to perform Tensor Operations that are appropriate for each +// TensorType. 
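// ---------------------------------------------------------------------------
// Editorial sketch (not part of this patch): the index arithmetic implied by
// the example above is the same modulo/divide walk used by
// IndexToScatterGatherOffsets earlier in this diff. With sizes = [3, 4] and
// strides = [4, 1], a host-side version and two hypothetical calls:
#include <cstdio>

unsigned int linearIdToOffset(unsigned int linearId,
                              const unsigned int* sizes,
                              const unsigned int* strides,
                              int dims) {
  unsigned int offset = 0;
  for (int d = dims - 1; d >= 0; --d) {
    unsigned int curDimIndex = linearId % sizes[d];  // coordinate in dim d
    offset += curDimIndex * strides[d];              // weight it by the stride
    linearId /= sizes[d];
  }
  return offset;
}

int main() {
  unsigned int sizes[2] = {3, 4};
  unsigned int contiguous[2] = {4, 1};  // the example layout in the comment above
  unsigned int transposed[2] = {1, 3};  // a hypothetical non-contiguous view
  // Element 7 is (row 1, col 3): 1*4 + 3*1 = 7 -- for a contiguous layout the
  // offset equals the linear index, which is why a contiguous fast path can
  // skip this loop entirely.
  std::printf("%u\n", linearIdToOffset(7, sizes, contiguous, 2));  // prints 7
  // The same element through the transposed strides: 1*1 + 3*3 = 10.
  std::printf("%u\n", linearIdToOffset(7, sizes, transposed, 2));  // prints 10
  return 0;
}
// ---------------------------------------------------------------------------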
+template +TensorInfo +getTensorInfo(THCState* state, TensorType* t) { + IndexType sz[MAX_CUTORCH_DIMS]; + IndexType st[MAX_CUTORCH_DIMS]; + + int dims = THCTensor__nDimension(state, t); + for (int i = 0; i < dims; ++i) { + sz[i] = THCTensor_size(state, t, i); + st[i] = THCTensor_stride(state, t, i); + } + + return TensorInfo( + t->template data(), dims, sz, st); +} + +template +struct ScalarNegate { + static __host__ __device__ T to(const T v) { return -v; } +}; + +template +struct ScalarInv { + static __host__ __device__ T to(const T v) { return ((T) 1) / v; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct ScalarNegate { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif +#else +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + half out = v; +#else + __half_raw out = __half_raw(v); +#endif + out.x ^= 0x8000; // toggle sign bit + return out; +#endif + } +}; + +template <> +struct ScalarInv { + static __host__ __device__ half to(const half v) { +#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) + return __float2half(1.0f / __half2float(v)); +#else + float fv = THC_half2float(v); + fv = 1.0f / fv; + return THC_float2half(fv); +#endif + } +}; + +inline bool operator==(half a, half b) { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + return a.x == b.x; +#else + __half_raw araw, braw; + araw = __half_raw(a); + braw = __half_raw(b); + return araw.x == braw.x; +#endif +} + +inline bool operator!=(half a, half b) { +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) + return a.x != b.x; +#else + __half_raw araw, braw; + araw = __half_raw(a); + braw = __half_raw(b); + return araw.x != braw.x; +#endif +} + +#endif // CUDA_HALF_TENSOR + +#endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/THCThreadLocal.cpp b/aten/src/THC/THCThreadLocal.cpp new file mode 100644 index 0000000..3cc95c3 --- /dev/null +++ b/aten/src/THC/THCThreadLocal.cpp @@ -0,0 +1,46 @@ +#include "THCThreadLocal.h" +#include "THCGeneral.h" +#ifdef _WIN32 +#include +#endif + + +THCThreadLocal THCThreadLocal_alloc(void) +{ +#ifndef _WIN32 + pthread_key_t key; + THAssert(pthread_key_create(&key, NULL) == 0); + return key; +#else + DWORD key = TlsAlloc(); + THAssert(key != TLS_OUT_OF_INDEXES); + return key; +#endif +} + +void THCThreadLocal_free(THCThreadLocal local) +{ +#ifndef _WIN32 + THAssert(pthread_key_delete(local) == 0); +#else + THAssert(TlsFree(local)); +#endif +} + +void* THCThreadLocal_get(THCThreadLocal local) +{ +#ifndef _WIN32 + return pthread_getspecific(local); +#else + return TlsGetValue(local); +#endif +} + +void THCThreadLocal_set(THCThreadLocal local, void* value) +{ +#ifndef _WIN32 + THAssert(pthread_setspecific(local, value) == 0); +#else + THAssert(TlsSetValue(local, value)); +#endif +} diff --git a/aten/src/THC/THCThreadLocal.h b/aten/src/THC/THCThreadLocal.h new file mode 100644 index 0000000..a733cac --- /dev/null +++ b/aten/src/THC/THCThreadLocal.h @@ -0,0 +1,17 @@ +#ifndef THC_THREAD_LOCAL_INC +#define THC_THREAD_LOCAL_INC + +#ifdef _WIN32 +#include +typedef DWORD THCThreadLocal; +#else +#include +typedef pthread_key_t THCThreadLocal; +#endif + +THCThreadLocal THCThreadLocal_alloc(void); +void THCThreadLocal_free(THCThreadLocal local); +void* THCThreadLocal_get(THCThreadLocal local); +void THCThreadLocal_set(THCThreadLocal local, void* value); + +#endif // THC_THREAD_LOCAL_INC diff --git a/aten/src/THC/THCThrustAllocator.cuh 
b/aten/src/THC/THCThrustAllocator.cuh new file mode 100644 index 0000000..0e75322 --- /dev/null +++ b/aten/src/THC/THCThrustAllocator.cuh @@ -0,0 +1,31 @@ +#ifndef THC_THRUST_ALLOCATOR_INC +#define THC_THRUST_ALLOCATOR_INC + +#include + +/// Allocator for Thrust to re-route its internal device allocations +/// to the THC allocator +class THCThrustAllocator { + public: + typedef char value_type; + + THCThrustAllocator(THCState* state) + : state_(state) { + } + + ~THCThrustAllocator() { + } + + char* allocate(std::ptrdiff_t size) { + return static_cast(THCudaMalloc(state_, size)); + } + + void deallocate(char* p, size_t size) { + THCudaFree(state_, p); + } + + private: + THCState* state_; +}; + +#endif // THC_THRUST_ALLOCATOR_INC diff --git a/aten/src/THC/generated/THCTensorMaskedByte.cu b/aten/src/THC/generated/THCTensorMaskedByte.cu new file mode 100644 index 0000000..08818af --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedChar.cu b/aten/src/THC/generated/THCTensorMaskedChar.cu new file mode 100644 index 0000000..27ac787 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedDouble.cu b/aten/src/THC/generated/THCTensorMaskedDouble.cu new file mode 100644 index 0000000..03e6b8e --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedFloat.cu b/aten/src/THC/generated/THCTensorMaskedFloat.cu new file mode 100644 index 0000000..bc4d9a0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedHalf.cu b/aten/src/THC/generated/THCTensorMaskedHalf.cu new file mode 100644 index 0000000..fc544cd --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedInt.cu b/aten/src/THC/generated/THCTensorMaskedInt.cu new file mode 100644 index 0000000..9714761 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMaskedLong.cu b/aten/src/THC/generated/THCTensorMaskedLong.cu new file mode 100644 index 0000000..355ea2b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateLongType.h" diff --git 
a/aten/src/THC/generated/THCTensorMaskedShort.cu b/aten/src/THC/generated/THCTensorMaskedShort.cu new file mode 100644 index 0000000..43fe037 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMaskedShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMasked.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMasked.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareByte.cu b/aten/src/THC/generated/THCTensorMathCompareByte.cu new file mode 100644 index 0000000..3eaf375 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareChar.cu b/aten/src/THC/generated/THCTensorMathCompareChar.cu new file mode 100644 index 0000000..471cf03 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareDouble.cu b/aten/src/THC/generated/THCTensorMathCompareDouble.cu new file mode 100644 index 0000000..7bbf36c --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareFloat.cu b/aten/src/THC/generated/THCTensorMathCompareFloat.cu new file mode 100644 index 0000000..5fc04be --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareHalf.cu b/aten/src/THC/generated/THCTensorMathCompareHalf.cu new file mode 100644 index 0000000..52d43ed --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareInt.cu b/aten/src/THC/generated/THCTensorMathCompareInt.cu new file mode 100644 index 0000000..81c056c --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareLong.cu b/aten/src/THC/generated/THCTensorMathCompareLong.cu new file mode 100644 index 0000000..a9ca765 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareShort.cu b/aten/src/THC/generated/THCTensorMathCompareShort.cu new file mode 100644 index 0000000..f620f52 --- /dev/null +++ 
b/aten/src/THC/generated/THCTensorMathCompareShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompare.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompare.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTByte.cu b/aten/src/THC/generated/THCTensorMathCompareTByte.cu new file mode 100644 index 0000000..0a46202 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTChar.cu b/aten/src/THC/generated/THCTensorMathCompareTChar.cu new file mode 100644 index 0000000..df0c4bb --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTDouble.cu b/aten/src/THC/generated/THCTensorMathCompareTDouble.cu new file mode 100644 index 0000000..6b9f4e7 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTFloat.cu b/aten/src/THC/generated/THCTensorMathCompareTFloat.cu new file mode 100644 index 0000000..b34a12b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTHalf.cu b/aten/src/THC/generated/THCTensorMathCompareTHalf.cu new file mode 100644 index 0000000..b38dc55 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTInt.cu b/aten/src/THC/generated/THCTensorMathCompareTInt.cu new file mode 100644 index 0000000..6a8a114 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTLong.cu b/aten/src/THC/generated/THCTensorMathCompareTLong.cu new file mode 100644 index 0000000..d5bf322 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathCompareTShort.cu b/aten/src/THC/generated/THCTensorMathCompareTShort.cu new file mode 100644 index 0000000..d41dab6 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathCompareTShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathCompareT.cuh" +#include 
"THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathCompareT.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseByte.cu b/aten/src/THC/generated/THCTensorMathPointwiseByte.cu new file mode 100644 index 0000000..b6fe10e --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseChar.cu b/aten/src/THC/generated/THCTensorMathPointwiseChar.cu new file mode 100644 index 0000000..af851f3 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu b/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu new file mode 100644 index 0000000..8053408 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu b/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu new file mode 100644 index 0000000..8149c27 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu b/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu new file mode 100644 index 0000000..29cbf26 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseInt.cu b/aten/src/THC/generated/THCTensorMathPointwiseInt.cu new file mode 100644 index 0000000..7e7c486 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseLong.cu b/aten/src/THC/generated/THCTensorMathPointwiseLong.cu new file mode 100644 index 0000000..583b271 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathPointwiseShort.cu b/aten/src/THC/generated/THCTensorMathPointwiseShort.cu new file mode 100644 index 0000000..8c30a6b --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathPointwise.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include 
"../generic/THCTensorMathPointwise.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceByte.cu b/aten/src/THC/generated/THCTensorMathReduceByte.cu new file mode 100644 index 0000000..27490e5 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceChar.cu b/aten/src/THC/generated/THCTensorMathReduceChar.cu new file mode 100644 index 0000000..9e55b7d --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceDouble.cu b/aten/src/THC/generated/THCTensorMathReduceDouble.cu new file mode 100644 index 0000000..5cd6b11 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceFloat.cu b/aten/src/THC/generated/THCTensorMathReduceFloat.cu new file mode 100644 index 0000000..7c21ce2 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceHalf.cu b/aten/src/THC/generated/THCTensorMathReduceHalf.cu new file mode 100644 index 0000000..f05f2d8 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceInt.cu b/aten/src/THC/generated/THCTensorMathReduceInt.cu new file mode 100644 index 0000000..f6fc959 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceLong.cu b/aten/src/THC/generated/THCTensorMathReduceLong.cu new file mode 100644 index 0000000..cff6374 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorMathReduceShort.cu b/aten/src/THC/generated/THCTensorMathReduceShort.cu new file mode 100644 index 0000000..1ad31a8 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathReduceShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorMathReduce.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorMathReduce.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generated/THCTensorSortByte.cu b/aten/src/THC/generated/THCTensorSortByte.cu new file mode 100644 index 
0000000..53923a2 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortByte.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateByteType.h" diff --git a/aten/src/THC/generated/THCTensorSortChar.cu b/aten/src/THC/generated/THCTensorSortChar.cu new file mode 100644 index 0000000..0e95c69 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortChar.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateCharType.h" diff --git a/aten/src/THC/generated/THCTensorSortDouble.cu b/aten/src/THC/generated/THCTensorSortDouble.cu new file mode 100644 index 0000000..770ffa0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortDouble.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateDoubleType.h" diff --git a/aten/src/THC/generated/THCTensorSortFloat.cu b/aten/src/THC/generated/THCTensorSortFloat.cu new file mode 100644 index 0000000..e7604b9 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortFloat.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateFloatType.h" diff --git a/aten/src/THC/generated/THCTensorSortHalf.cu b/aten/src/THC/generated/THCTensorSortHalf.cu new file mode 100644 index 0000000..c783ff0 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortHalf.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateHalfType.h" diff --git a/aten/src/THC/generated/THCTensorSortInt.cu b/aten/src/THC/generated/THCTensorSortInt.cu new file mode 100644 index 0000000..1597eab --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortInt.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateIntType.h" diff --git a/aten/src/THC/generated/THCTensorSortLong.cu b/aten/src/THC/generated/THCTensorSortLong.cu new file mode 100644 index 0000000..787a942 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortLong.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateLongType.h" diff --git a/aten/src/THC/generated/THCTensorSortShort.cu b/aten/src/THC/generated/THCTensorSortShort.cu new file mode 100644 index 0000000..8a0c275 --- /dev/null +++ b/aten/src/THC/generated/THCTensorSortShort.cu @@ -0,0 +1,5 @@ +#include "../THCTensorSort.cuh" +#include "THCTensor.hpp" +#include "THCStream.h" +#include "../generic/THCTensorSort.cu" +#include "../THCGenerateShortType.h" diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp new file mode 100644 index 0000000..98b4c3b --- /dev/null +++ b/aten/src/THC/generic/THCStorage.cpp @@ -0,0 +1,123 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorage.cpp" +#else + +real* THCStorage_(data)(THCState *state, const THCStorage *self) +{ + return self->data(); +} + +ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage *self) +{ + return THStorage_size(self); +} + +int THCStorage_(elementSize)(THCState *state) +{ + return sizeof(real); +} + 
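// [Editor's note] The generated THCTensor*<Type>.cu files above and the THCStorage_(...)
// names in this generic file come from TH's "generic file" pattern: one implementation,
// parameterized by `real`/`Real`, is re-included once per scalar type, and TH_CONCAT_*
// macros paste the type name into every symbol. The snippet below is a simplified,
// self-contained imitation of that mechanism; the macro and function names here are
// illustrative, not the real THCGenerate*Type.h machinery.

#include <cstdio>

// One "generic" definition, stamped out per element type. The real code spreads this
// across generic/*.cpp plus THCGenerate*Type.h headers; here the whole expansion is
// collapsed into a single macro for illustration.
#define DEFINE_STORAGE_ELEMENT_SIZE(real, Real)   \
  int Storage##Real##_elementSize(void) {         \
    return (int)sizeof(real);                     \
  }

DEFINE_STORAGE_ELEMENT_SIZE(unsigned char, Byte)   // expands to StorageByte_elementSize
DEFINE_STORAGE_ELEMENT_SIZE(float, Float)          // expands to StorageFloat_elementSize
DEFINE_STORAGE_ELEMENT_SIZE(double, Double)        // expands to StorageDouble_elementSize

int main() {
  std::printf("Byte=%d Float=%d Double=%d\n",
              StorageByte_elementSize(),
              StorageFloat_elementSize(),
              StorageDouble_elementSize());
  return 0;
}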
+void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) +{ + THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) +{ + THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); + real value; + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), + cudaMemcpyDeviceToHost, stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + return value; +} + +THCStorage* THCStorage_(new)(THCState *state) +{ + return THCStorage_new(state, at::CTypeToScalarType::to()); +} + +THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) +{ + return THCStorage_newWithSize(state, at::CTypeToScalarType::to(), size); +} + +THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, + at::Allocator* allocator) +{ + return THCStorage_newWithAllocator(state, at::CTypeToScalarType::to(), + size, allocator); +} + +THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 1); + THCStorage_(set)(state, self, 0, data0); + return self; +} + +THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 2); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + return self; +} + +THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 3); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + THCStorage_(set)(state, self, 2, data2); + return self; +} + +THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3) +{ + THCStorage *self = THCStorage_(newWithSize)(state, 4); + THCStorage_(set)(state, self, 0, data0); + THCStorage_(set)(state, self, 1, data1); + THCStorage_(set)(state, self, 2, data2); + THCStorage_(set)(state, self, 3, data3); + return self; +} + +THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, ptrdiff_t size, int isShared) +{ + THError("not available yet for THCStorage"); + return NULL; +} + +THCStorage* THCStorage_(newWithDataAndAllocator)( + THCState *state, at::DataPtr&& data, ptrdiff_t size, + at::Allocator *allocator) { + return THCStorage_newWithDataAndAllocator(state, at::CTypeToScalarType::to(), std::move(data), size, allocator); +} + +void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag) +{ + THStorage_setFlag(storage, flag); +} + +void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag) +{ + THStorage_clearFlag(storage, flag); +} + +void THCStorage_(retain)(THCState *state, THCStorage *self) +{ + THStorage_retain(self); +} + +void THCStorage_(free)(THCState *state, THCStorage *self) +{ + THCStorage_free(state, self); +} +#endif diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu new file mode 100644 index 0000000..c3f25f4 --- /dev/null +++ b/aten/src/THC/generic/THCStorage.cu @@ -0,0 +1,25 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE 
"generic/THCStorage.cu" +#else + +void THCStorage_(fill)(THCState *state, THCStorage *self, real value) +{ + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr self_data(THCStorage_(data)(state, self)); + thrust::fill( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + self_data, self_data+self->size, value); +} + +void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THCStorage_resize(state, self, size); +} + +THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { + return THCStorage_getDevice(state, storage); +} + +#endif diff --git a/aten/src/THC/generic/THCStorage.h b/aten/src/THC/generic/THCStorage.h new file mode 100644 index 0000000..4ac2fcd --- /dev/null +++ b/aten/src/THC/generic/THCStorage.h @@ -0,0 +1,58 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorage.h" +#else + +#define TH_STORAGE_REFCOUNTED 1 +#define TH_STORAGE_RESIZABLE 2 + +#define THCStorage THStorage + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THCStorage type. +#define THCudaStorage THCStorage +#define THCudaDoubleStorage THCStorage +#ifdef CUDA_HALF_TENSOR +#define THCudaHalfStorage THCStorage +#endif +#define THCudaByteStorage THCStorage +#define THCudaCharStorage THCStorage +#define THCudaShortStorage THCStorage +#define THCudaIntStorage THCStorage +#define THCudaLongStorage THCStorage + +THC_API real* THCStorage_(data)(THCState *state, const THCStorage*); +THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*); +THC_API int THCStorage_(elementSize)(THCState *state); + +/* slow access -- checks everything */ +THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, real); +THC_API real THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t); + +THC_API THCStorage* THCStorage_(new)(THCState *state); +THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size); +THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real); +THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real); +THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real); +THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real); +THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared); + +#ifdef __cplusplus +THC_API THCStorage* THCStorage_(newWithAllocator)( + THCState *state, ptrdiff_t size, + at::Allocator* allocator); +THC_API THCStorage* THCStorage_(newWithDataAndAllocator)( + THCState *state, at::DataPtr&& data, ptrdiff_t size, + at::Allocator* allocator); +#endif + +THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag); +THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag); +THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage); + +THC_API void THCStorage_(free)(THCState *state, THCStorage *storage); +THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value); + +THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage); + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp new file mode 100644 index 0000000..dc877b6 --- /dev/null +++ 
b/aten/src/THC/generic/THCStorageCopy.cpp @@ -0,0 +1,72 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.cpp" +#else + +void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) +{ + THArgCheck(self->size == src->size, 2, "size does not match"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), + THStorage_(data)(src), + self->size * sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ +void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ +{ \ + THCTensor* selfTensor = \ + THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1); \ + struct TH##TYPEC##Tensor* srcTensor = \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size, 1); \ + THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ + TH##TYPEC##Tensor_free(srcTensor); \ + THCTensor_(free)(state, selfTensor); \ +} +TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Char) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Short) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Int) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Long) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Float) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Half) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) + +void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) +{ + THArgCheck(self->size == src->size, 2, "size does not match"); + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), + THCStorage_(data)(state, src), + self->size * sizeof(real), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); +} + +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ +void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ +{ \ + TH##TYPEC##Tensor* selfTensor = \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size, 1); \ + struct THCTensor* srcTensor = \ + THCTensor_(newWithStorage1d)(state, src, 0, src->size, 1); \ + TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ + THCTensor_(free)(state, srcTensor); \ + TH##TYPEC##Tensor_free(selfTensor); \ +} +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half) +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double) + +#undef TH_CUDA_STORAGE_IMPLEMENT_COPY +#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu new file mode 100644 index 0000000..ba50004 --- /dev/null +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -0,0 +1,46 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.cu" +#else + +void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) +{ + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); +} + +// conversions are delegated to THCTensor implementation +#define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ +void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ +{ \ + THArgCheck(self->size == src->size, 2, "size does 
not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1); \ + struct THCuda##TYPECUDA##Tensor* srcTensor = \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size, 1); \ + THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ + THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ + THCTensor_(free)(state, selfTensor); \ +} + +THC_CUDA_STORAGE_IMPLEMENT_COPY(Byte,Byte) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Char,Char) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Short,Short) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Int,Int) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) +THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float +THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) +#ifdef CUDA_HALF_TENSOR +THC_CUDA_STORAGE_IMPLEMENT_COPY(Half,Half) +#endif + +#undef THC_CUDA_STORAGE_IMPLEMENT_COPY + +void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src) +{ + THCStorage_(TH_CONCAT_2(copyCuda, Real))(state, self, src); +} + +void THCStorage_(copy)(THCState *state, THCStorage *self, THCStorage *src) +{ + THCStorage_(copyCuda)(state, self, src); +} + +#endif diff --git a/aten/src/THC/generic/THCStorageCopy.h b/aten/src/THC/generic/THCStorageCopy.h new file mode 100644 index 0000000..7a4ef6b --- /dev/null +++ b/aten/src/THC/generic/THCStorageCopy.h @@ -0,0 +1,42 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCStorageCopy.h" +#else + +/* Support for copy between different Storage types */ + +THC_API void THCStorage_(rawCopy)(THCState *state, THCStorage *storage, real *src); +THC_API void THCStorage_(copy)(THCState *state, THCStorage *storage, THCStorage *src); +THC_API void THCStorage_(copyByte)(THCState *state, THCStorage *storage, struct THByteStorage *src); +THC_API void THCStorage_(copyChar)(THCState *state, THCStorage *storage, struct THCharStorage *src); +THC_API void THCStorage_(copyShort)(THCState *state, THCStorage *storage, struct THShortStorage *src); +THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct THIntStorage *src); +THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src); +THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src); +THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src); +THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src); + +THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src); +THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src); +THC_API void THCStorage_(copyCudaShort)(THCState *state, THCStorage *storage, struct THCudaShortStorage *src); +THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, struct THCudaIntStorage *src); +THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); +THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); +THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); +#endif + +THC_API void TH_CONCAT_2(THByteStorage_copyCuda , Real)(THCState *state, THByteStorage *self, struct THCStorage *src); +THC_API void 
TH_CONCAT_2(THCharStorage_copyCuda , Real)(THCState *state, THCharStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THShortStorage_copyCuda , Real)(THCState *state, THShortStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src); +THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src); + +THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src); +THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src); +THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src); + +#endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp new file mode 100644 index 0000000..c0924a5 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.cpp @@ -0,0 +1,731 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.cpp" +#else + +/**** access methods ****/ +THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self) +{ + return self->storage; +} + +ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) +{ + return self->storageOffset; +} + +int THCTensor_(nDimension)(THCState *state, const THCTensor *self) +{ + return THCTensor_nDimension(state, self); +} + +int THCTensor_(_nDimension)(THCState *state, const THCTensor *self) +{ + return THCTensor__nDimension(state, self); +} + +int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) +{ + return THCTensor_size(state, self, dim); +} + +int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) +{ + return THCTensor_stride(state, self, dim); +} + +THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) +{ + return THCTensor_newSizeOf(state, self); +} + +THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) +{ + THLongStorage *stride = THLongStorage_newWithSize(self->dim()); + THLongStorage_rawCopy(stride, self->stride); + return stride; +} + +real *THCTensor_(data)(THCState *state, const THCTensor *self) +{ + if(self->storage) + return (THCStorage_(data)(state, self->storage)+self->storageOffset); + else + return NULL; +} + +/**** creation methods ****/ + +/* Empty init */ +THCTensor *THCTensor_(new)(THCState *state) +{ + return new THCTensor(THCStorage_(new)(state)); +} + +/* Pointer-copy init */ +THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) +{ + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, + self, + tensor->storage, + tensor->storageOffset, + tensor->dim(), + tensor->size, + tensor->stride); + return self; +} + +/* Storage init */ +THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) +{ + if(size && stride) + THArgCheck(size->size == stride->size, 4, "inconsistent size"); + + AT_CHECK(size, "size must not be null"); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, + self, + storage, + 
storageOffset, + size->size, + THLongStorage_data(size), + (stride ? THLongStorage_data(stride) : NULL)); + + return self; +} + +THCTensor *THCTensor_(newWithStorageIntLists)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); + + return self; +} + +THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0}, {stride0}); +} + +THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0, size1}, {stride0, stride1}); +} + +THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, {size0, size1, size2}, {stride0, stride1, stride2}); +} + +THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2, + int64_t size3, int64_t stride3) +{ + return THCTensor_(newWithStorageIntLists)(state, storage, storageOffset, + {size0, size1, size2, size3}, + {stride0, stride1, stride2, stride3}); +} + +THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size, THLongStorage *stride) +{ + return THCTensor_(newWithStorage)(state, NULL, 0, size, stride); +} + +THCTensor *THCTensor_(newWithSizeIntList)(THCState *state, at::IntList sizes) { + THCTensor *self = new THCTensor(THCStorage_(new)(state)); + THCTensor_(resizeNd)(state, self, sizes.size(), const_cast(sizes.data()), nullptr); + + return self; +} + +THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0) +{ + return THCTensor_(newWithSizeIntList)(state, {size0}); +} + +THCTensor *THCTensor_(newWithSize2d)(THCState *state, int64_t size0, int64_t size1) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1}); +} + +THCTensor *THCTensor_(newWithSize3d)(THCState *state, int64_t size0, int64_t size1, int64_t size2) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1, size2}); +} + +THCTensor *THCTensor_(newWithSize4d)(THCState *state, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + return THCTensor_(newWithSizeIntList)(state, {size0, size1, size2, size3}); +} + +THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self) +{ + THCTensor *tensor = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, tensor, self); + THCTensor_(copy)(state, tensor, self); + return tensor; +} + +THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *self) +{ + if(!THCTensor_(isContiguous)(state, self)) { + return THCTensor_(newClone)(state, self); + } else { + THCTensor_(retain)(state, self); + return self; + } +} + +THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_) +{ + THCTensor *self = 
THCTensor_(newWithTensor)(state, tensor); + THCTensor_(select)(state, self, NULL, dimension_, sliceIndex_); + return self; +} + +THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(narrow)(state, self, NULL, dimension_, firstIndex_, size_); + return self; +} + +THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(transpose)(state, self, NULL, dimension1_, dimension2_); + return self; +} + +THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, int64_t size_, int64_t step_) +{ + THCTensor *self = THCTensor_(newWithTensor)(state, tensor); + THCTensor_(unfold)(state, self, NULL, dimension_, size_, step_); + return self; +} + +THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size) +{ + ptrdiff_t numel = THCTensor_(nElement)(state, tensor); + THCTensor *self = THCTensor_(new)(state); + THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); + auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), + at::IntList(tensor->stride, tensor->dim()), + at::IntList(inferred_size->data(), inferred_size->size)); + THArgCheck(stride.has_value(), 2, "view size is " + "not compatible with input tensor's size and stride (at least one dimension spans " + "across two contiguous subspaces). Call .contiguous() before .view()."); + auto stride_value = *stride; + THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); + THLongStorage_rawCopy(new_stride, stride_value.data()); + THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THLongStorage_free(inferred_size); + THLongStorage_free(new_stride); + return self; +} + +// Collapses the first two dimensions of a tensor. +// Assumes the input tensor is contiguous. 
+THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input) { + int in_dims = THCTensor_(_nDimension)(state, input); + THArgCheck(in_dims >= 2, 1, "Tensor needs to have at least two dimensions"); + THArgCheck(THCTensor_(isContiguous)(state, input), 1, + "Tensor must be contiguous"); + THLongStorage *newSize = THLongStorage_newWithSize(in_dims - 1); + THLongStorage_data(newSize)[0] = THCTensor_(size)(state, input, 0) * THCTensor_(size)(state, input, 1); + for (int i = 2; i < in_dims; i++) { + THLongStorage_data(newSize)[i - 1] = THCTensor_(size)(state, input, i); + } + THCTensor *output = THCTensor_(newView)(state, input, newSize); + THLongStorage_free(newSize); + return output; +} + +/* Resize */ +void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) +{ + THCTensor_resize(state, self, size, stride); +} + +void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_resizeAs(state, self, src); +} + +void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0) +{ + int64_t size[1] = {size0}; + THCTensor_resizeNd(state, tensor, 1, size, nullptr); +} + +void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0, int64_t size1) +{ + int64_t size[2] = {size0, size1}; + THCTensor_resizeNd(state, tensor, 2, size, nullptr); +} + +void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0, int64_t size1, int64_t size2) +{ + int64_t size[3] = {size0, size1, size2}; + THCTensor_resizeNd(state, tensor, 3, size, nullptr); +} + +void THCTensor_(resize4d)(THCState *state, THCTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3) +{ + int64_t size[4] = {size0, size1, size2, size3}; + THCTensor_resizeNd(state, self, 4, size, nullptr); +} + +void THCTensor_(resize5d)(THCState *state, THCTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4) +{ + int64_t size[5] = {size0, size1, size2, size3, size4}; + THCTensor_resizeNd(state, self, 5, size, nullptr); +} + +void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_set(state, self, src); +} + +void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) +{ + if(size_ && stride_) + THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); + + AT_CHECK(size_, "size must not be null"); + THCTensor_(setStorageNd)(state, + self, + storage_, + storageOffset_, + size_->size, + THLongStorage_data(size_), + (stride_ ? 
THLongStorage_data(stride_) : NULL)); +} + +void THCTensor_(setStorageIntLists)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + at::IntList sizes, at::IntList strides) +{ + AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + + THCTensor_(setStorageNd)(state, self, storage_, storageOffset_, sizes.size(), + const_cast(sizes.data()), const_cast(strides.data())); +} + +void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_}, {stride0_}); +} + +void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_, size1_}, + {stride0_, stride1_}); +} + +void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_) +{ + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, + {size0_, size1_, size2_}, + {stride0_, stride1_, stride2_}); +} + +void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_) +{ + + int64_t size[4] = {size0_, size1_, size2_, size3_}; + int64_t stride[4] = {stride0_, stride1_, stride2_, stride3_}; + + THCTensor_(setStorageIntLists)(state, self, storage_, storageOffset_, size, stride); +} + + +void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t firstIndex, int64_t size) +{ + if(!src) + src = self; + + THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); + THArgCheck( firstIndex >= 0, 4, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM + THArgCheck( size >= 0, 5, "out of range"); +#else + THArgCheck( size > 0, 5, "out of range"); +#endif + THArgCheck(firstIndex+size <= src->size[dimension], 5, "out of range"); + + THCTensor_(set)(state, self, src); + + if(firstIndex > 0) + self->storageOffset += firstIndex*self->stride[dimension]; + + self->size[dimension] = size; +} + +void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t sliceIndex) +{ + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SCALAR + THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 4, "out of range"); + + THCTensor_(set)(state, self, src); + THCTensor_(narrow)(state, self, NULL, dimension, sliceIndex, 1); + for(d = dimension; d < self->dim()-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->dim_--; +} + +void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1, int dimension2) +{ + int64_t z; + + if(!src) + src = self; + + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + + THCTensor_(set)(state, self, src); + + if(dimension1 == dimension2) 
+ return; + + z = self->stride[dimension1]; + self->stride[dimension1] = self->stride[dimension2]; + self->stride[dimension2] = z; + z = self->size[dimension1]; + self->size[dimension1] = self->size[dimension2]; + self->size[dimension2] = z; +} + +void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t size, int64_t step) +{ + int64_t *newSize; + int64_t *newStride; + int d; + + if(!src) + src = self; + +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck(dimension < src->dim(), 2, "out of range"); + THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(step > 0, 4, "invalid step"); + + THCTensor_(set)(state, self, src); + + newSize = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + newStride = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + + newSize[self->dim()] = size; + newStride[self->dim()] = self->stride[dimension]; + for(d = 0; d < self->dim(); d++) + { + if(d == dimension) + { + newSize[d] = (self->size[d] - size) / step + 1; + newStride[d] = step*self->stride[d]; + } + else + { + newSize[d] = self->size[d]; + newStride[d] = self->stride[d]; + } + } + + THFree(self->size); + THFree(self->stride); + + self->size = newSize; + self->stride = newStride; + self->dim_++; +} + +/* we have to handle the case where the result is a number */ +void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) +{ + int ndim = 0; + int d; + + if(!src) + src = self; + + THCTensor_(set)(state, self, src); + + for(d = 0; d < src->dim(); d++) + { + if(src->size[d] != 1) + { + if(d != ndim) + { + self->size[ndim] = src->size[d]; + self->stride[ndim] = src->stride[d]; + } + ndim++; + } + } + +#ifndef USE_TH_SCALAR + /* right now, we do not handle 0-dimension tensors */ + if(ndim == 0 && src->dim() > 0) + { + self->size[0] = 1; + self->stride[0] = 1; + ndim = 1; + } + self->dim_ = ndim; +} +#endif + +void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCTensor_squeeze1d(state, self, src, dimension); +} + +void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCTensor_unsqueeze1d(state, self, src, dimension); +} + +int THCTensor_(isContiguous)(THCState *state, const THCTensor *self) +{ + return THCTensor_isContiguous(state, self); +} + +int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims) +{ + int d; + if (self->dim() != dims->size) + return 0; + + for (d = 0; d < self->dim(); ++d) + { + if (self->size[d] != THLongStorage_data(dims)[d]) + return 0; + } + return 1; +} + +int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src) +{ + if (self->storage == src->storage && + self->storageOffset == src->storageOffset && + self->dim() == src->dim()) + { + int d; + for (d = 0; d < self->dim(); ++d) + { + if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + return 0; + } + return 1; + } + return 0; +} + +int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor* src) +{ + int d; + if (self->dim() != src->dim()) + return 0; + for(d = 0; d < self->dim(); ++d) + { + if(self->size[d] != src->size[d]) + return 0; + } + return 1; +} + +ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self) +{ + return THCTensor_nElement(state, self); +} + +void THCTensor_(retain)(THCState *state, THCTensor *self) +{ + THCTensor_retain(state, self); +} + +void 
THCTensor_(free)(THCState *state, THCTensor *self) +{ + THCTensor_free(state, self); +} + +void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) +{ + if(self != dst) + THCTensor_(copy)(state, dst, self); + + THCTensor_(free)(state, self); +} + +/*******************************************************************************/ + +void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); +} + +void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + +void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) +{ + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); +} + +real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) +{ + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); +} + +void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) +{ + THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); +} + +real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) +{ + THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); +} + +void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) +{ + THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); +} + +real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) +{ + THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); +} + +void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) +{ + THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of 
range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); +} + +real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) +{ + THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); + THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); +} + +int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) +{ + /* FIXME: remove this flag after any users stop using it since it is + now superseded by the runtime option */ +#ifdef DISABLE_CHECK_GPU + return 1; +#else + int kernelP2PEnabled = + THCState_getKernelPeerToPeerAccessEnabled(state); + + int curDev = -1; + THCudaCheck(cudaGetDevice(&curDev)); + va_list(args); + va_start(args, nTensors); + int valid = 1; + for (unsigned int i = 0; i < nTensors; i++) { + THCTensor* tensor = va_arg(args, THCTensor*); + if (tensor == NULL) { + continue; + } + int tensorDev = THCTensor_(getDevice)(state, tensor); + if (tensorDev == -1) { + /* This tensor does not have GPU memory (empty) */ + continue; + } + + if (tensorDev != curDev) { + if (kernelP2PEnabled) { + /* Kernel p2p access is allowed */ + /* Can `curDev` access `tensorDev` directly? */ + if (!THCState_getPeerToPeerAccess(state, curDev, tensorDev)) { + valid = 0; + break; + } + } else { + /* No kernel p2p access allowed */ + valid = 0; + break; + } + } + } + + va_end(args); + return valid; +#endif // DISABLE_CHECK_GPU +} + +THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor) { + const int L = THC_DESC_BUFF_LEN; + THCDescBuff buf; + char *str = buf.str; + int n = 0; + n += snprintf(str, L-n, "["); + int i; + for(i = 0; i < tensor->dim(); i++) { + if(n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + if(i < tensor->dim()-1) { + n += snprintf(str+n, L-n, " x "); + } + } + if(n < L - 2) { + snprintf(str+n, L-n, "]"); + } else { + snprintf(str+L-5, 5, "...]"); + } + return buf; +} + +#endif diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu new file mode 100644 index 0000000..9847834 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.cu @@ -0,0 +1,9 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.cu" +#else + +THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { + return THCTensor_getDevice(state, tensor); +} + +#endif diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h new file mode 100644 index 0000000..8e9bf84 --- /dev/null +++ b/aten/src/THC/generic/THCTensor.h @@ -0,0 +1,141 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensor.h" +#else + +typedef struct THCTensor THCTensor; + +// These used to be distinct types; for some measure of backwards compatibility and documentation +// alias these to the single THCTensor type. 
+#define THCudaTensor THCTensor +#define THCudaDoubleTensor THCTensor +#ifdef CUDA_HALF_TENSOR +#define THCudaHalfTensor THCTensor +#endif +#define THCudaByteTensor THCTensor +#define THCudaCharTensor THCTensor +#define THCudaShortTensor THCTensor +#define THCudaIntTensor THCTensor +#define THCudaLongTensor THCTensor + +/**** access methods ****/ +THC_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self); +THC_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self); + +// See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). +THC_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(_nDimension)(THCState *state, const THCTensor *self); + +THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); +THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); +THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); + +THC_API void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag); +THC_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag); + + +/**** creation methods ****/ +THC_API THCTensor *THCTensor_(new)(THCState *state); +THC_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor); +/* stride might be NULL */ +THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +/* stride might be NULL */ +THC_API THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size_, THLongStorage *stride_); +THC_API THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0_); +THC_API THCTensor *THCTensor_(newWithSize2d)(THCState *state, int64_t size0_, int64_t size1_); +THC_API THCTensor *THCTensor_(newWithSize3d)(THCState *state, int64_t size0_, int64_t size1_, int64_t size2_); +THC_API THCTensor *THCTensor_(newWithSize4d)(THCState *state, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); + +THC_API THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self); +THC_API THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *tensor); +THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_); +THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_); +THC_API THCTensor *THCTensor_(newTranspose)(THCState 
*state, THCTensor *tensor, int dimension1_, int dimension2_); +THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, int64_t size_, int64_t step_); +THC_API THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size); +THC_API THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input); + +// resize* methods simply resize the storage. So they may not retain the current data at current indices. +// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the +// values, unless you are doing some size and stride tricks, do not use resize*. +THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src); +THC_API void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0_); +THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_); +THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_); +THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); +THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_); + +THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); +THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_); +THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); +THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); +THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); + +THC_API void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t firstIndex_, int64_t size_); +THC_API void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t sliceIndex_); +THC_API void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1_, int dimension2_); +THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t size_, int64_t step_); + +THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int 
dimension_); +THC_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_); + +THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src); +THC_API int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src); +THC_API int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims); +THC_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self); + +THC_API void THCTensor_(retain)(THCState *state, THCTensor *self); +THC_API void THCTensor_(free)(THCState *state, THCTensor *self); +THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst); + +/* Slow access methods [check everything] */ +THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value); +THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value); +THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value); +THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value); + +THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0); +THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1); +THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2); +THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3); + +/* CUDA-specific functions */ +THC_API int THCTensor_(getDevice)(THCState *state, const THCTensor *self); +THC_API int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...); + +/* debug methods */ +THC_API THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor); + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp new file mode 100644 index 0000000..5715133 --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -0,0 +1,175 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.cpp" +#else + +/* specific methods */ + +void THCTensor_(copyCPU)(THCState *state, THCTensor *self, struct THTensor *src) +{ + THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match"); + + { + THCTensor *selfc = THCTensor_(newContiguous)(state, self); + src = THTensor_(newContiguous)(src); + + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state,selfc), + THTensor_(data)(src), + THTensor_(nElement)(src) * sizeof(real), + cudaMemcpyHostToDevice, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + + THTensor_(free)(src); + THCTensor_(freeCopyTo)(state, selfc, self); + } +} + +#define IMPLEMENT_TH_CUDA_TENSOR_COPY(TYPEC) \ +void THCTensor_(copy##TYPEC)(THCState *state, THCTensor *self, struct TH##TYPEC##Tensor *src) \ +{ \ + THArgCheck(THCTensor_(nElement)(state, self) == TH##TYPEC##Tensor_nElement(src), 2, "sizes do not match"); \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCTensor_(copyCPU)(state, self, (THTensor*) src); /* cast just removes warnings */ \ + } else { \ + THLongStorage *size = TH##TYPEC##Tensor_newSizeOf(src); \ + THTensor *srcf = THTensor_(newWithSize)(size, NULL); \ + \ + 
THTensor_(copy##TYPEC)(srcf, src); \ + THCTensor_(copyCPU)(state, self, srcf); \ + \ + THLongStorage_free(size); \ + THTensor_(free)(srcf); \ + } \ +} + +IMPLEMENT_TH_CUDA_TENSOR_COPY(Byte) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Char) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Short) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Int) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Long) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Float) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Double) +IMPLEMENT_TH_CUDA_TENSOR_COPY(Half) + +/* copyCuda */ + +void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) +{ + THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); + + { + THTensor *selfc = THTensor_(newContiguous)(self); + src = THCTensor_(newContiguous)(state, src); + + cudaStream_t stream = THCState_getCurrentStream(state); + THCudaCheck(cudaMemcpyAsync(THTensor_(data)(selfc), + THCTensor_(data)(state, src), + THCTensor_(nElement)(state, src) * sizeof(real), + cudaMemcpyDeviceToHost, + stream)); + THCudaCheck(cudaStreamSynchronize(stream)); + + THCTensor_(free)(state, src); + THTensor_(freeCopyTo)(selfc, self); + } +} + +#define IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(TYPEC) \ + void TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(THCState *state, TH##TYPEC##Tensor *self, struct THCTensor *src) \ + { \ + THArgCheck(TH##TYPEC##Tensor_nElement(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THTensor_(copyCuda)(state, (THTensor*) self, src); /* cast just removes compiler warning */ \ + } else { \ + THLongStorage *size = THCTensor_(newSizeOf)(state, src); \ + THTensor *srcf = THTensor_(newWithSize)(size, NULL); \ + \ + THTensor_(copyCuda)(state, srcf, src); \ + TH_CONCAT_4(TH,TYPEC,Tensor_copy,Real)(self, srcf); \ + \ + THLongStorage_free(size); \ + THTensor_(free)(srcf); \ + } \ + } + +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Byte) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Char) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Short) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Int) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double) +IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Half) + +void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src) +{ + THCTensor_(copy)(state, self, src); +} + +void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src) +{ + THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match"); + THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous"); + THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous"); + + if (THCTensor_(nElement)(state, self) == 0) return; + + // Perform the copy wrt the current stream on the CudaTensor's device. 
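+  // (The code below temporarily switches to the tensor's device, if it is not already current, so that the async memcpy and the host-allocator event record are issued against that device's stream; the caller's device is restored afterwards.)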
+ int tensorDevice = THCTensor_(getDevice)(state, self); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } + + THCStream *stream = THCState_getStream(state); + THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self), + THTensor_(data)(src), + THTensor_(nElement)(src) * sizeof(real), + cudaMemcpyHostToDevice, + THCStream_stream(stream))); + + THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } +} + +void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src) +{ + THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); + THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous"); + + if (THTensor_(nElement)(self) == 0) return; + + // Perform the copy wrt the current stream on the CudaTensor's device. + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } + + THCStream *stream = THCState_getStream(state); + THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self), + THCTensor_(data)(state, src), + THCTensor_(nElement)(state, src) * sizeof(real), + cudaMemcpyDeviceToHost, + THCStream_stream(stream))); + + THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, src->storage), stream)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } +} + +#undef IMPLEMENT_TH_CUDA_TENSOR_COPY +#undef IMPLEMENT_TH_CUDA_TENSOR_COPY_TO + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu new file mode 100644 index 0000000..0e2630c --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -0,0 +1,80 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.cu" +#else + +THC_API void +THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { + if (dst == src) return; + THC_copyTensor(state, dst, src); +} + +template <> +THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { + THCTensor *tensor = THCTensor_new(state, self->storage->scalar_type); + THCTensor_resizeAs(state, tensor, self); + THC_copyTensor(state, tensor, self); + return tensor; +} + +template <> +THCTensor *THCTensor_newContiguous(THCState *state, THCTensor *self) +{ + if(!THCTensor_isContiguous(state, self)) { + return THCTensor_newClone(state, self); + } else { + THCTensor_retain(state, self); + return self; + } +} + + +template <> +void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor *dst) { + if(self != dst) + THC_copyTensor(state, dst, self); + + THCTensor_free(state, self); +} + +template <> +void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, THCTensor* src) { + // Called when we are copying into an overlapping index `dst`, but + // we don't care which writer wins. Hacky but it works. + // This is itself invoked by pointwiseApply2 / THCTensor_copy in + // case that there are write overlaps.
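+  // (One illustrative way such overlaps arise: a destination whose strides map several logical indices to the same memory location, e.g. a tensor expanded with a zero stride, so two threads may write the same element and an arbitrary writer wins.)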
+ // FIXME: really, overlapping writes should be illegal/an error in Torch + THC_pointwiseApply2( + state, dst, src, + CopyOp(), + ReadOnly, /* ignore overwrites */ + ReadOnly); +} + +THC_API void +THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { + THCTensor_copyIgnoringOverlaps(state, dst, src); +} + +#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ + THC_API void \ + THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ + THC_copyTensor(state, self, src); \ + } + +IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte, uint8_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char, int8_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short, int16_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int, int32_t) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) +// THCudaTensor aka the non-existent THCudaFloatTensor +IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) +#ifdef CUDA_HALF_TENSOR +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +#endif + +#undef IMPLEMENT_THC_CUDA_TENSOR_COPY + +#endif diff --git a/aten/src/THC/generic/THCTensorCopy.h b/aten/src/THC/generic/THCTensorCopy.h new file mode 100644 index 0000000..e549f09 --- /dev/null +++ b/aten/src/THC/generic/THCTensorCopy.h @@ -0,0 +1,43 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorCopy.h" +#else + +THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src); +THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src); +THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src); +THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor *src); +THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src); +THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src); +THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src); +THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src); + +THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src); +THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src); +THC_API void THCTensor_(copyCudaShort)(THCState *state, THCTensor *dst, struct THCudaShortTensor *src); +THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *dst, struct THCudaIntTensor *src); +THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *dst, struct THCudaLongTensor *src); +THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *dst, struct THCudaTensor *src); +THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *dst, struct THCudaDoubleTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *dst, struct THCudaHalfTensor *src); +#endif + +THC_API void TH_CONCAT_2(THByteTensor_copyCuda , Real) (THCState *state, THByteTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THCharTensor_copyCuda , Real) (THCState *state, THCharTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THShortTensor_copyCuda , Real) (THCState *state, THShortTensor *self, THCTensor *src); +THC_API void 
TH_CONCAT_2(THIntTensor_copyCuda , Real) (THCState *state, THIntTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THLongTensor_copyCuda , Real) (THCState *state, THLongTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real) (THCState *state, THFloatTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real) (THCState *state, THHalfTensor *self, THCTensor *src); +THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src); + +THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src); +THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src); + +THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src); +THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src); + +#endif diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu new file mode 100644 index 0000000..0e6a7ff --- /dev/null +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -0,0 +1,654 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorIndex.cu" +#else + +// Check tensor dimensions for index operations, and return the slice size. +// src can be nullptr in case of indexFill: in that case it is ignored. +static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, + int dim, + THCudaLongTensor *index, + THCTensor *src) +{ + int dstDims = THCTensor_(_nDimension)(state, dst); + int srcDims = (src == nullptr) ? dstDims : THCTensor_(_nDimension)(state, src); + + THArgCheck(THCudaLongTensor__nDimension(state, index) == 1, 4, + "expecting vector of indices"); + THArgCheck(dim >= 0 && dim < dstDims, 2, "Indexing dim is out of bounds"); + + ptrdiff_t dstSliceSize = 1; + for (int d = 0; d < dstDims; d++) { + if (d != dim) { + dstSliceSize *= dst->size[d]; + } + } + + if (src == nullptr) return dstSliceSize; + + THArgCheck(dim < srcDims, 3, "Indexing dim is out of bounds"); + THArgCheck(THCudaLongTensor_nElement(state, index) == src->size[dim], 4, + "length of src.size[dim] is not equal to length of indices"); + + ptrdiff_t srcSliceSize = 1; + bool mismatch = false; + + if (dstDims != srcDims) mismatch = true; + + for (int d = 0; d < srcDims; d++) { + if (d != dim) { + srcSliceSize *= src->size[d]; + if (!mismatch && dst->size[d] != src->size[d]) mismatch = true; + } + } + + THArgCheck(dstSliceSize == srcSliceSize, 2, + "Source/destination tensor have different slice sizes (%ld vs %ld)", + dstSliceSize, srcSliceSize); + + if (mismatch) { + static bool warningShown = false; + if (!warningShown) { + warningShown = true; + fprintf(stderr, + "Warning: source/destination slices have same size but different " + "shape for an index operation. This behavior is deprecated.\n"); + } + } + + return dstSliceSize; +} + +// Compare the stride between adjacent slices (sliceStride) with strides in the +// other dimensions (i.e., strides *inside* each slice). +// +// - Returns true if some dimension inside the slice has lower stride than +// sliceStride. The simplest example is a 2-D contiguous tensor with sliceDim +// == 0 (that is, each slice is a row). +// +// In this case, we choose the CUDA kernel that processes the data in +// "index-major order". For example, if thread count equals slice size, then +// all threads process slice #0 in lockstep, and then slice #1, and so on. 
+// +// - Otherwise (i.e., sliceStride has the lowest value), this function returns +// false. The simplest example is a 2-D contiguous tensor with sliceDim == 1 +// (each slice is a column). +// +// In this case, we choose the CUDA kernel that processes the data in +// "elementInSlice-major order". For example, each thread can process element +// #0 of every slice, and then element #1 of every slice, and so on. +bool THCTensor_(indexShouldBeMajor)(TensorInfo &info, + int sliceDim) +{ + // The stride between adjacent slices (e.g., between element #0 of slice #100 + // and element #0 of slice #101). + unsigned int sliceStride = info.strides[sliceDim]; + + for (int i = 0; i < info.dims; ++i) { + if (i != sliceDim && info.sizes[i] > 1 && info.strides[i] < sliceStride) { + return true; + } + } + + return false; +} + +void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t sliceSize = THCTensor_(getSliceSize)(state, dst, dim, indices, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); + int64_t dstCopyDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexCopySmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexCopyLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstCopyDim, srcCopyDim, srcTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndices, \ + dstCopyDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstCopyDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstCopyDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcCopyDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcCopyDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstCopyDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstCopyDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstCopyDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcCopyDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcCopyDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLongTensor *index) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, + "tried to take from an empty tensor"); + + THCTensor_(resizeNd)(state, dst, index->dim(), index->size, NULL); + + // dispatchTakePut only handles non-empty tensors; + if (index->_dim() > 0) { + dispatchTakePut(state, src, dst, index); + } +} + +static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { + THCThrustAllocator 
thrustAlloc(state); + + auto index_iter = thrust::device_ptr(THCudaLongTensor_data(state, index)); + auto src_iter = thrust::device_ptr(THCTensor_(data)(state, src)); + auto numel = THCTensor_(numel)(state, src); + + thrust::sort_by_key( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + index_iter, index_iter + numel, + src_iter, ThrustLTOp()); +} + +void THCTensor_(put)(THCState *state, THCTensor *dst, THCudaLongTensor *index, THCTensor *src, int accumulate) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + ptrdiff_t dstSize = THCTensor_(nElement)(state, dst); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, index); + THArgCheck(THCTensor_(nElement)(state, src) == numIndices, + 3, "src should have the same number of elements as index"); + + THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + if (numIndices == 0) { + return; + } + + if (accumulate) { + // wrap indices so to replace negative indices + THCudaLongTensor* sorted_index = THCudaLongTensor_new(state); + THCudaLongTensor_resizeAs(state, sorted_index, index); + THC_pointwiseApply2(state, sorted_index, index, WrapIndexOp(dstSize)); + + THCTensor* sorted_src = THCTensor_(newClone)(state, src); + + THCTensor_(sort_indices)(state, sorted_index, sorted_src); + dispatchTakePut(state, dst, sorted_src, sorted_index); + + THCTensor_(free)(state, sorted_src); + THCudaLongTensor_free(state, sorted_index); + } else { + dispatchTakePut(state, dst, src, index); + } +} + +void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t sliceSize = THCTensor_(getSliceSize)(state, dst, dim, indices, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); + int64_t dstAddDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddSmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstAddDim, srcAddDim, sliceSize, dstAddDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexAddLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstAddDim, srcAddDim, srcTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndices, \ + dstAddDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstAddDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstAddDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcAddDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcAddDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstAddDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstAddDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstAddDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcAddDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcAddDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, real val) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, dst)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. 
+ ptrdiff_t sliceSize = + THCTensor_(getSliceSize)(state, dst, dim, indices, nullptr); + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); + int64_t dstFillDimSize = THCTensor_(size)(state, dst, dim); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + cudaStream_t stream = THCState_getCurrentStream(state); + int indContig = THCudaLongTensor_isContiguous(state, indices); + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM) \ + indexFillSmallIndex \ + <<>>( \ + dstInfo, indicesInfo, \ + dstFillDim, sliceSize, dstFillDimSize, val); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexFillLargeIndex \ + <<>>( \ + dstInfo, indicesInfo, \ + dstFillDim, sliceSize * numIndices, \ + (IDX_IS_MAJOR) ? sliceSize : numIndices, \ + dstFillDimSize, val); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstFillDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstFillDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, -2); + } else if (dstInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, -2); + } else if (dstInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstFillDim); + + if (dstInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, -2, true); + } else if (dstInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 2, -2, false); + } + } else if (dstInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstFillDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstFillDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THCudaLongTensor *indices) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, dst, src, indices)); + + int dims = THCTensor_(_nDimension)(state, dst); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, src); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); + + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); + + int srcDims = 
THCTensor_(_nDimension)(state, src); + cudaStream_t stream = THCState_getCurrentStream(state); + + THArgCheck(THCudaLongTensor__nDimension(state, indices) <= 1, 3, + "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds"); + THArgCheck(srcDims > 0, 2, "Source tensor is empty"); + + THLongStorage *newSize; + + if (numIndices == 0) { + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, 0, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + return; + } + + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, dim, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + + int indContig = THCudaLongTensor_isContiguous(state, indices); + + // The `src` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); + int64_t srcSelectDimSize = THCTensor_(size)(state, src, dim); + ptrdiff_t sliceSize = dstTotalSize / numIndices; + + int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexSelectSmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize); + +#define LARGE_INDEX(TENSOR_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexSelectLargeIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstSelectDim, srcSelectDim, dstTotalSize, \ + (IDX_IS_MAJOR) ? sliceSize : numIndices, \ + srcSelectDimSize); + + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); + + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstSelectDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstSelectDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcSelectDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcSelectDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + SMALL_INDEX(real, unsigned int, 1, 1, -2); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + SMALL_INDEX(real, unsigned int, 2, 2, -2); + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + SMALL_INDEX(real, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(real, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = THCTensor_(indexShouldBeMajor)(dstInfo, dstSelectDim); + + if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) { + LARGE_INDEX(real, unsigned int, 1, 1, -2, true); + } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(real, unsigned 
int, 2, 2, -2, false); + } + } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(real, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(real, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(real, unsigned int, -1, -1, -1, true); + } + } + } else { + TensorInfo dstInfo = + getTensorInfo(state, dst); + int dstSelectDim = dstInfo.collapseDims(dim); + dstInfo.reduceDim(dstSelectDim); + + TensorInfo srcInfo = + getTensorInfo(state, src); + int srcSelectDim = srcInfo.collapseDims(dim); + srcInfo.reduceDim(srcSelectDim); + + TensorInfo indicesInfo = + getTensorInfo(state, indices); + indicesInfo.collapseDims(); + + LARGE_INDEX(real, uint64_t, -1, -1, -1, true); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +#endif diff --git a/aten/src/THC/generic/THCTensorIndex.h b/aten/src/THC/generic/THCTensorIndex.h new file mode 100644 index 0000000..03ff54c --- /dev/null +++ b/aten/src/THC/generic/THCTensorIndex.h @@ -0,0 +1,12 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorIndex.h" +#else + +THC_API void THCTensor_(indexCopy)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src); +THC_API void THCTensor_(indexAdd)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src); +THC_API void THCTensor_(indexFill)(THCState *state, THCTensor *tensor, int dim, THCudaLongTensor *index, real val); +THC_API void THCTensor_(indexSelect)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index); +THC_API void THCTensor_(take)(THCState *state, THCTensor *res_, THCTensor *src, THCudaLongTensor *index); +THC_API void THCTensor_(put)(THCState *state, THCTensor *res_, THCudaLongTensor *indices, THCTensor *src, int accumulate); + +#endif diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu new file mode 100644 index 0000000..80c1344 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -0,0 +1,193 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.cu" +#else + + +THC_API void +THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); + THArgCheck(THCTensor_(nElement)(state, tensor) == + THCudaByteTensor_nElement(state, mask), + 2, "sizes do not match"); + + if (!THC_pointwiseApply2(state, tensor, mask, + TensorMaskedFillOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedFill)(state, tensor, maskCuda, value); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); + ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); + ptrdiff_t srcSize = THCTensor_(nElement)(state, src); + + // `mask` and `tensor` must have the same number of elements + 
THArgCheck(maskSize == tensorSize, 2, + "mask and tensor must have the same number of elements"); + + // Determine our output size + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); + + // The number of `1` elements present in the mask must be <= the + // number of elements available in `src` + if (totalElements > srcSize) { + THArgCheck(false, 2, "source nElements must be == mask `1` elements"); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (int64_t) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); + + // update `tensor` where `mask` == 1 but pull from `src` at + // maskPrefixSum + bool status = THC_pointwiseApply3( + state, tensor, mask, maskPrefixSum, + TensorMaskedCopyOp( + THCTensor_(data)(state, contigSrc))); + + THCTensor_(free)(state, contigSrc); + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedCopy)(state, tensor, maskCuda, src); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + THArgCheck(THCudaByteTensor_nElement(state, mask) == + THCTensor_(nElement)(state, src), + 2, "sizes do not match"); + + // Determine our output size + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); + THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor); + + THCTensor_(resize1d)(state, tensorContig, totalElements); + if (tensor != tensorContig) { + THCTensor_(resize1d)(state, tensor, totalElements); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? 
Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (int64_t) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // Then copy over the masked elements at their desired output index + bool status = THC_pointwiseApply3( + state, mask, maskPrefixSum, + src, TensorMaskedSelectOp( + THCTensor_(data)(state, tensor))); + + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + if (tensor != tensorContig) { + THCTensor_(freeCopyTo)(state, tensorContig, tensor); + } else { + THCTensor_(free)(state, tensorContig); + } + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void +THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedSelect)(state, tensor, src, maskCuda); + THCudaByteTensor_free(state, maskCuda); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMasked.h b/aten/src/THC/generic/THCTensorMasked.h new file mode 100644 index 0000000..98f5aee --- /dev/null +++ b/aten/src/THC/generic/THCTensorMasked.h @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.h" +#else + +THC_API void THCTensor_(maskedFill)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + real value); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedFillByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + real value); + +THC_API void THCTensor_(maskedCopy)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + THCTensor *src); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedCopyByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + THCTensor *src); + +THC_API void THCTensor_(maskedSelect)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THCudaByteTensor *mask); + +// FIXME: remove now that we have THCudaByteTensor? 
+THC_API void THCTensor_(maskedSelectByte)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THByteTensor *mask); + +#endif diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu new file mode 100644 index 0000000..8bdd8fa --- /dev/null +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -0,0 +1,485 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.cu" +#else + +THC_API void +THCTensor_(fill)(THCState* state, THCTensor *self_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + + if (!THC_pointwiseApply1( + state, self_, TensorFillOp(value))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zero)(THCState *state, THCTensor *self_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + if (THCTensor_(isContiguous)(state, self_)) { + THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_), + 0, + sizeof(real) * THCTensor_(nElement)(state, self_), + THCState_getCurrentStream(state))); + } else { + if (!THC_pointwiseApply1( + state, self_, + TensorFillOp(ScalarConvert::to(0)))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); + THCTensor_(resizeAs)(state, r_, input); + THCTensor_(zero)(state, r_); +} + +THC_API void +THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); + THCTensor_(resizeAs)(state, r_, input); + THCTensor_(fill)(state, r_, ScalarConvert::to(1)); +} + +ptrdiff_t +THCTensor_(numel)(THCState *state, THCTensor *t) +{ + return THCTensor_(nElement)(state, t); +} + +void THCTensor_(cat)(THCState *state, THCTensor *result, + THCTensor *ta, THCTensor *tb, int dimension) +{ + THCTensor* inputs[2]; + inputs[0] = ta; + inputs[1] = tb; + THCTensor_(catArray)(state, result, inputs, 2, dimension); +} + +void THCTensor_(check_shape_except_dim)(THCState *state, + THCTensor *first, THCTensor *second, int dimension); +inline void THCTensor_(check_shape_except_dim)(THCState *state, + THCTensor *first, THCTensor *second, int dimension) +{ + int first_dims = first->dim(); + int second_dims = second->dim(); + THArgCheck(first_dims == second_dims, 0, + "Tensors must have same number of dimensions: got %d and %d", + first_dims, second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = THCTensor_(size)(state, first, dim); + int64_t second_dim_size = THCTensor_(size)(state, second, dim); + THArgCheck(first_dim_size == second_dim_size, 0, + "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", + dimension, (long long)first_dim_size, (long long)second_dim_size, dim); + } +} + +void THCTensor_(catArray)(THCState *state, THCTensor *result, + THCTensor **inputs, int numInputs, int dimension) +{ + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). 
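+  // For example, a 1-D tensor of size [0] in `inputs` is skipped, while an empty tensor of any other shape is not.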
+ // FIXME: warn if this is the case + THLongStorage *size; + int i, j, cohortMax; + int64_t offset; + bool hasSkippedInput = false; + THCTensor *notSkippedTensor = NULL; // non-owning reference + auto should_skip = [](THCTensor *t) { return t->is_empty() && t->dim() == 1; }; + int nDims = 0; + + for (i = 0; i < numInputs; i++) + { + if (should_skip(inputs[i])) { + hasSkippedInput = true; + continue; + } + nDims = inputs[i]->dim(); + notSkippedTensor = inputs[i]; + } + + // If all inputs are empty tensors, return an empty tensor + if (notSkippedTensor == NULL) { + return; + } + + THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs); + THArgCheck(dimension >= 0, 4, "invalid dimension %d", dimension); + + size = THLongStorage_newWithSize(nDims); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + for (int i = 0; i < numInputs; i++) { + THCTensor *tensor = inputs[i]; + if (should_skip(tensor)) { + continue; + } + THCTensor_(check_shape_except_dim)(state, notSkippedTensor, tensor, dimension); + cat_dim_size += THCTensor_(size)(state, tensor, dimension); + } + + // Compute the size of the result + for (int dim = 0; dim < nDims; dim++) { + int64_t result_dim_size = THCTensor_(size)(state, notSkippedTensor, dim); + if (dim == dimension) { + result_dim_size = cat_dim_size; + } + THLongStorage_data(size)[dim] = result_dim_size; + } + THCTensor_(resize)(state, result, size, NULL); + THLongStorage_free(size); + + // We parallelize the copy if all 6 conditions pass: + // + // 1. There is more than one input tensor + // 2. No empty inputs + // 3. The result tensor is 32-bit indexable + // 4. The number of dimensions is <= 4 + // 5. All input tensors are contiguous (output tensor may be non-contig) + // 6. All input tensors can use 32-bit indexing + // 7. All input tensors are on the same device + + if (numInputs > 1 && + !hasSkippedInput && + result->dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + THCTensor_canUse32BitIndexMath(state, result) && + THCTensor_allContiguous(state, inputs, numInputs) && + THCTensor_all32BitIndexable(state, inputs, numInputs) && + THCTensor_allSameDevice(state, inputs, numInputs)) { + + // First, let's set up our kernel parameters. We start with a raw pointer to the storage + // for the output Tensor. + real *data = THCTensor_(data)(state, result); + + // Kernel Parameter + size_t tensorMetadataSize = sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; + auto d_inputs = static_cast *>(THCudaMalloc(state, tensorMetadataSize)); + + OutputTensorSizeStride param; + + // Next, let's initialize the size, stride arrays for the output Tensor. 
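+    // `param` is passed to the copy kernel by value, so the sizes/strides captured here are a snapshot of `result`.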
+ for (i = 0; i < nDims; ++i) { + param.outputSize[i] = THCTensor_(size)(state, result, i); + param.outputStride[i] = THCTensor_(stride)(state, result, i); + } + + THCStream* stream = THCState_getStream(state); + + // Template Declarations for dim = 1, 2, 3, 4 +#define HANDLE_CASE(DIMS) \ + CatArrayBatchedCopy<<>>(data, d_inputs, param, dimension, param.outputStride[dimension]); + + // Now we loop + offset = 0; + for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) { + // Re-allocate stackInputs every iteration to avoid read-after-write hazard + { + auto stackInputs_owner = THCudaHostAlloc(state, tensorMetadataSize); + CatArrInputTensor* stackInputs = static_cast*>(stackInputs_owner.get()); + cohortMax = 0; + for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) { + int64_t dimSize = THCTensor_(size)(state, inputs[i+j], dimension); + + stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]); + stackInputs[j].offset = offset; + stackInputs[j].dimSize = dimSize; + stackInputs[j].nElements = THCTensor_(nElement)(state, inputs[i+j]); + cohortMax = cohortMax > (int) stackInputs[j].nElements ? cohortMax : (int) stackInputs[j].nElements; + + // update offset + offset += dimSize; + } + THCudaCheck(cudaMemcpyAsync( + d_inputs, + stackInputs, + j * sizeof(CatArrInputTensor), + cudaMemcpyHostToDevice, + THCStream_stream(stream))); + THCudaHostRecord(state, stackInputs); + } + + // Next, let's consider how we set our kernel launch parameters. + // We borrow from THCApply, which the kernel's internal indexing + // is based on. + dim3 applyBlock = getApplyBlock(); + + //Get grid where x dim fills half gpu and y dim is number of tensors. + //This will have cating two tensors fill the entire grid, but prevent + //many threads from needlessly load meta data if their sizes is small. 
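+      // j counts the tensors staged in this batch, so catGrid.y gets one row of blocks per staged tensor.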
+ dim3 catGrid; + getCatGrid(state, j, catGrid); + + + switch (nDims) { + case 1: + HANDLE_CASE(1); + break; + case 2: + HANDLE_CASE(2); + break; + case 3: + HANDLE_CASE(3); + break; + case 4: + HANDLE_CASE(4); + break; + } + THCudaCheck(cudaGetLastError()); + } + THCudaFree(state, d_inputs); +#undef HANDLE_CASE + } else { + offset = 0; + for (j = 0; j < numInputs; j++) + { + if (should_skip(inputs[j])) continue; + int64_t dimSize = THCTensor_(size)(state, inputs[j], dimension); + THCTensor *nt = THCTensor_(newWithTensor)(state, result); + THCTensor_(narrow)(state, nt, NULL, dimension, offset, dimSize); + THCTensor_(copy)(state, nt, inputs[j]); + THCTensor_(free)(state, nt); + offset += dimSize; + } + } +} + +void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, + THCTensor *self) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self )); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, tensor)); + + + using namespace thrust::placeholders; + THCThrustAllocator thrustAlloc(state); + self = THCTensor_(newContiguous)(state, self); + thrust::device_ptr self_data(THCTensor_(data)(state, self)); + + int num_dim = THCTensor_(nDimension)(state, self); + int64_t N = THCTensor_(nElement)(state, self); + + THCudaLongTensor_resize2d(state, tensor, N, num_dim); + tensor = THCudaLongTensor_newContiguous(state, tensor); + thrust::device_ptr tensor_data(THCudaLongTensor_data(state, tensor)); + + thrust::counting_iterator idxfirst(0); + thrust::counting_iterator idxlast = idxfirst + N; + + typedef thrust::device_ptr Iter; + strided_range strided_tensor(tensor_data, + tensor_data+N*num_dim, num_dim); + +#if CUDA_VERSION >= 7000 + cudaStream_t stream = THCState_getCurrentStream(state); +#endif + + strided_range::iterator dend = thrust::copy_if( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(stream), +#endif + idxfirst, + idxlast, + self_data, + strided_tensor.begin(), + NonZeroOp() + ); + + int64_t num_nonzeros = thrust::distance(strided_tensor.begin(), dend); + + int64_t div = 1; + for (int dim = num_dim-1; dim >= 0; dim--) { + strided_range stride_dim(tensor_data+dim, + tensor_data+N*num_dim, num_dim); + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(stream), +#endif + strided_tensor.begin(), + strided_tensor.end(), + stride_dim.begin(), + idx_functor(div, self->size[dim]) + ); + div *= self->size[dim]; + } + + THCudaLongTensor_resize2d(state, tensor, num_nonzeros, num_dim); + + THCTensor_(free)(state, self); + THCudaLongTensor_free(state, tensor); + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + int nDimension = THCTensor_(nDimension)(state, src_); +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!src_->is_empty()); +#endif + THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); + if (nDimension == 2) { + int64_t stride0 = THCTensor_(stride)(state, src_, 0); + int64_t stride1 = THCTensor_(stride)(state, src_, 1); + int64_t size0 = THCTensor_(size)(state, src_, 0); + int64_t size1 = THCTensor_(size)(state, src_, 1); + int64_t size = (k > 0) ? 
min((int64_t)size0, (int64_t)size1 - k) : min((int64_t)size0 + k, (int64_t)size1); + THCTensor_(resize1d)(state, self_, size); + if (size > 0) { + int64_t strideSelf = THCTensor_(stride)(state, self_, 0); + const dim3 threads(min((int64_t)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (int64_t)size)); + dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (int64_t)threads.x))); + int64_t start = (k >= 0 ? k * stride1 : -k * stride0); + THCTensor_copyFromDiagonal<<>> + (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, size, stride0 + stride1, strideSelf); + } + } else { + ptrdiff_t totalElements = THCTensor_(nElement)(state, src_); + ptrdiff_t size = (k > 0) ? totalElements + k : totalElements - k; + int64_t strideSrc = THCTensor_(stride)(state, src_, 0); + THCTensor_(resize2d)(state, self_, size, size); + THCTensor_(zero)(state, self_); + if (size > 0) { + int64_t stride0 = THCTensor_(stride)(state, self_, 0); + int64_t stride1 = THCTensor_(stride)(state, self_, 1); + const dim3 threads(min((int64_t)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (int64_t)size)); + dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (ptrdiff_t)threads.x))); + ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0); + THCTensor_copyToDiagonal<<>> + (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, totalElements, stride0 + stride1, strideSrc); + } + } + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(eye)(THCState *state, THCTensor *self_, int64_t n, int64_t m) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + THArgCheck(n > 0, 1, "invalid argument"); + + if(m <= 0) + m = n; + + THCTensor_(resize2d)(state, self_, n, m); + THCTensor_(zero)(state, self_); + + int64_t sz = THMin(n, m); + int64_t stride = THCTensor_(stride)(state, self_, 0) + + THCTensor_(stride)(state, self_, 1); + + THCTensor *diag = THCTensor_(newWithStorage1d)(state, self_->storage, + self_->storageOffset, sz, stride); + + THCTensor_(fill)(state, diag, ScalarConvert::to(1)); + THCTensor_(free)(state, diag); +} + +accreal THCTensor_(trace)(THCState *state, THCTensor *src_) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, src_)); + THArgCheck((src_->_dim() == 2), 1, "expected a matrix"); + THCTensor *diag = THCTensor_(new)(state); + THCTensor_(diag)(state, diag, src_, 0); + accreal trace = THCTensor_(sumall)(state, diag); + THCTensor_(free)(state, diag); + return trace; +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n); + if (n == 0) { + // skip + } else if (n == 1) THCTensor_(fill)(state, r_, a); + else { + THCTensor *r = THCTensor_(isContiguous)(state, r_) + ? 
r_ // if r_ is contiguous we can direct work on it + : THCTensor_(newContiguous)(state, r_); + real step = THCNumerics::div(THCNumerics::sub(b, a), + ScalarConvert::to(n - 1)); + LinspaceOp linspace_method(a, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + n, linspace_method); + if (!THCTensor_(isContiguous)(state, r_)) { // We need to move data back to r_ + THCTensor_(freeCopyTo)(state, r, r_); + } + } + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + // NumPy allows you to pass different points even if n <= 1 -- should we? + THArgCheck(n > 1 || ((n == 0 || n == 1) && (a == b)), 3, "invalid number of points"); + if (THCTensor_(nElement)(state, r_) != n) THCTensor_(resize1d)(state, r_, n); + if (n == 0) { + // skip + } else if (n == 1) THCTensor_(fill)(state, r_, THCNumerics::exp10(a)); + else { + THCTensor *r = THCTensor_(isContiguous)(state, r_) + ? r_ + : THCTensor_(newContiguous)(state, r_); + real step = THCNumerics::div(THCNumerics::sub(b, a), + ScalarConvert::to(n - 1)); + LogspaceOp logspace_method(a, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + n, logspace_method); + if (!THCTensor_(isContiguous)(state, r_)) { + THCTensor_(freeCopyTo)(state, r, r_); + } + } + THCudaCheck(cudaGetLastError()); +} + +#endif + +void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + ptrdiff_t size = (ptrdiff_t) (((xmax - xmin) / step) + 1); + if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size); + THCTensor *r = THCTensor_(newContiguous)(state, r_); + LinspaceOp linspace_method(xmin, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + size, linspace_method); + THCTensor_(freeCopyTo)(state, r, r_); + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(arange)(THCState* state, THCTensor *r_, accreal xmin, accreal xmax, accreal step) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, r_)); + THArgCheck(step > 0 || step < 0, 3, "step must be nonzero"); + THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) + , 2, "upper bound and larger bound inconsistent with step sign"); + ptrdiff_t size = (ptrdiff_t) ceil(ScalarConvert::to(xmax - xmin) / step); + if (THCTensor_(nElement)(state, r_) != size) THCTensor_(resize1d)(state, r_, size); + THCTensor *r = THCTensor_(newContiguous)(state, r_); + LinspaceOp linspace_method(xmin, step); + thrust::device_ptr data_(THCTensor_(data)(state, r)); + thrust::tabulate(data_, data_ + size, linspace_method); + THCTensor_(freeCopyTo)(state, r, r_); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMath.h b/aten/src/THC/generic/THCTensorMath.h new file mode 100644 index 0000000..1cd7534 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMath.h @@ -0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.h" +#else + +THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value); +THC_API void THCTensor_(zero)(THCState *state, THCTensor *self); + +THC_API 
void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor* input); +THC_API void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor* input); +THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t); +THC_API void THCTensor_(cat)(THCState *state, THCTensor *result, THCTensor *ta, THCTensor *tb, int dimension); +THC_API void THCTensor_(catArray)(THCState *state, THCTensor *result, THCTensor **inputs, int numInputs, int dimension); +THC_API void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, THCTensor *self); + +THC_API void THCTensor_(tril)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(triu)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(diag)(THCState *state, THCTensor *self, THCTensor *src, int64_t k); +THC_API void THCTensor_(eye)(THCState *state, THCTensor *self, int64_t n, int64_t k); + +THC_API accreal THCTensor_(trace)(THCState *state, THCTensor *self); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(linspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n); +THC_API void THCTensor_(logspace)(THCState *state, THCTensor *r_, real a, real b, int64_t n); + +#endif + +THC_API void THCTensor_(range)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step); +THC_API void THCTensor_(arange)(THCState *state, THCTensor *r_, accreal xmin, accreal xmax, accreal step); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu new file mode 100644 index 0000000..6d1da07 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -0,0 +1,944 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.cu" +#else + +#define ERROR_ONLY_FP_TYPES(func) \ + THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); + +THC_API accreal +THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + THArgCheck(THCTensor_(nElement)(state, self) == + THCTensor_(nElement)(state, src), 2, "sizes do not match"); + + self = THCTensor_(newContiguous)(state, self); + src = THCTensor_(newContiguous)(state, src); + +#ifdef THC_REAL_IS_FLOAT + accreal result = THCudaBlas_Sdot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#elif defined(THC_REAL_IS_DOUBLE) + accreal result = THCudaBlas_Ddot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#elif defined(THC_REAL_IS_HALF) + accreal result = ScalarConvert::to( + THCudaBlas_Hdot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1)); +#endif + + THCTensor_(free)(state, src); + THCTensor_(free)(state, self); + return result; + +#else + ERROR_ONLY_FP_TYPES("dot"); + return ScalarConvert::to(0); +#endif +} + +THC_API void +THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); + if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + THError("matrix and vector expected"); + + if( mat->size[1] != vec->size[0] ) + THError("size mismatch"); + + if(t->_dim() != 1) + THError("size mismatch"); + + if(t->size[0] != mat->size[0]) + THError("size mismatch"); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + if(r_ != t) + { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(mat->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else if(mat->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cmat = THCTensor_(newContiguous)(state, mat); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, cmat), cmat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, cmat), cmat->stride[0], + THCTensor_(data)(state, vec), 
vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + + THCTensor_(free)(state, cmat); + } + +#elif defined(THC_REAL_IS_HALF) + // Currently no Hgemv/SgemvEx in Cublas + THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size[0], 1); + + THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); + THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size[0], 1); + + THCTensor_(addmm)(state, r_, beta, tAsMatrix, alpha, mat, vecAsMatrix); + + // r_ will have answer as matrix, need to return a vector + THCTensor_(resize1d)(state, r_, r_->size[0]); + THCTensor_(free)(state, vecAsMatrix); + THCTensor_(free)(state, tAsMatrix); +#endif +#else + ERROR_ONLY_FP_TYPES("addmv"); +#endif +} + +THC_API void +THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); + if ( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) { + THError("vector and vector expected"); + } + + if (t->_dim() != 2) { + THError("size mismatch"); + } + + if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + THError("size mismatch"); + } + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + if (r_ != t) { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(THCNumerics::eq(beta, ScalarConvert::to(0))) { + THCTensor_(zero)(state, r_); + } else if(THCNumerics::ne(beta, ScalarConvert::to(1))) { + THCTensor_(mul)(state, r_, r_, beta); + } + + if(r_->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#endif + } + else if(r_->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cr = THCTensor_(newClone)(state, r_); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#endif + + THCTensor_(freeCopyTo)(state, cr, r_); + } +#elif defined(THC_REAL_IS_HALF) + // currently no Hger/SgerEx in Cublas. 
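+  // Emulate ger via addmm: view vec1 as an n x 1 matrix and the transposed vec2 as a 1 x m matrix, so the GEMM forms the outer product.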
+ THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); + THCTensor_(resize2d)(state, vec2T, vec2T->size[0], 1); + THCTensor_(transpose)(state, vec2T, NULL, 0, 1); + + THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); + THCTensor_(resize2d)(state, vec1M, vec1M->size[0], 1); + + THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); + THCTensor_(free)(state, vec2T); + THCTensor_(free)(state, vec1M); +#endif +#else + ERROR_ONLY_FP_TYPES("addr"); +#endif +} + +THC_API void +THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *m1, THCTensor *m2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2)); + char transpose_r, transpose_m1, transpose_m2; + THCTensor *r__, *m1_, *m2_; + + if( (m1->_dim() != 2) || (m2->_dim() != 2) ) + THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + + if(t->_dim() != 2) + THError("matrix expected, got %dD tensor for t", t->_dim()); + + if(m1->size[1] != m2->size[0]) { + THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); + THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); + THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); + } + + if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + THCDescBuff bt = THCTensor_(sizeDesc)(state, t); + THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); + THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); + THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str); + } + + if(t != r_) + { + THCTensor_(resizeAs)(state, r_, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, r_, t); + } + } + + /* r_ */ + if(r_->stride[0] == 1 && + r_->stride[1] != 0) + { + transpose_r = 'n'; + r__ = r_; + } + else if(r_->stride[1] == 1 && + r_->stride[0] != 0) + { + THCTensor *swap = m2; + m2 = m1; + m1 = swap; + transpose_r = 't'; + r__ = r_; + } + else + { + transpose_r = 'n'; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, r_, 0, 1); + r__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, r__, NULL, 0, 1); + } + + /* m1 */ + if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m1 = 'n'; + m1_ = m1; + } + else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m1 = 't'; + m1_ = m1; + } + else + { + transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); + m1_ = THCTensor_(newContiguous)(state, m1); + } + + /* m2 */ + if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m2 = 'n'; + m2_ = m2; + } + else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m2 = 't'; + m2_ = m2; + } + else + { + transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); + m2_ = THCTensor_(newContiguous)(state, m2); + } + +#ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 
0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#endif + + /* free intermediate variables */ + if(m1_ != m1) { + THCTensor_(free)(state, m1_); + } + + if(m2_ != m2) { + THCTensor_(free)(state, m2_); + } + + if(r__ != r_) { + THCTensor_(freeCopyTo)(state, r__, r_); + } +#else + ERROR_ONLY_FP_TYPES("addmm"); +#endif +} + +THC_API void +THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(_nDimension)(state, t) == 2, 4, "expected 2D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + + int64_t batchnum = THCTensor_(size)(state, batch1, 0); + int64_t m1d1 = THCTensor_(size)(state, batch1, 1); + int64_t innerdim = THCTensor_(size)(state, batch1, 2); + int64_t m2d2 = THCTensor_(size)(state, batch2, 2); + + THArgCheck(batchnum == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + // M is t, as listed in the docs under addbmm + THArgCheck(m1d1 == THCTensor_(size)(state, t, 0), 6, + "first dimension must match first dimension of M"); + THArgCheck(m2d2 == THCTensor_(size)(state, t, 1), 7, + "second dimension must match second dimension of M"); + THArgCheck(innerdim == THCTensor_(size)(state, batch2, 1), 6, + "second dimension must match first dimension of batch2"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, result, t); + } + } + + THCTensor *slice1 = THCTensor_(new)(state); + THCTensor *slice2 = THCTensor_(new)(state); + for (int64_t i=0; i::to(1); + } + THCTensor_(free)(state, slice1); + THCTensor_(free)(state, slice2); +#else + ERROR_ONLY_FP_TYPES("addbmm"); +#endif +} + +__global__ void createBatchGemmBuffer(const real** buffer, real* data, + int64_t stride, int64_t num_batches) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) { + buffer[idx] = data + idx * stride; + } +} + +__global__ void 
createBatchGemmBuffer3(const real** buffer1, const real ** buffer2, const real ** buffer3, real* data1, + real * data2, real * data3, int64_t stride1, int64_t stride2, int64_t stride3, int64_t num_batches) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) { + buffer1[idx] = data1 + idx * stride1; + buffer2[idx] = data2 + idx * stride2; + buffer3[idx] = data3 + idx * stride3; + } +} + +THC_API void +THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(_nDimension)(state, t) == 3, 4, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 1) == THCTensor_(size)(state, batch1, 1), 6, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, t, 2) == THCTensor_(size)(state, batch2, 2), 7, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, batch1, 2) == THCTensor_(size)(state, batch2, 1), 6, + "wrong matrix size"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + if (ScalarConvert::to(beta) != 0.0) { + THCTensor_(copy)(state, result, t); + } + } + + bool transpose_result; + char transpose_batch1, transpose_batch2; + int64_t lda, ldb, ldc; + THCTensor *result_, *batch1_, *batch2_; + if (result->stride[1] == 1) + { + transpose_result = false; + result_ = result; + ldc = result_->stride[2]; + } + else if (result->stride[2] == 1) + { + transpose_result = true; + + THCTensor *swap = batch2; + batch2 = batch1; + batch1 = swap; + + result_ = result; + ldc = result_->stride[1]; + } + else + { + transpose_result = false; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, result, 1, 2); + result_ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, result_, NULL, 1, 2); + + ldc = result_->stride[2]; + } + + if (batch1->stride[transpose_result ? 2 : 1] == 1 && + batch1->stride[transpose_result ? 1 : 2] != 0) + { + transpose_batch1 = 'n'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 1 : 2]; + } + else if (batch1->stride[transpose_result ? 1 : 2] == 1 && + batch1->stride[transpose_result ? 2 : 1] != 0) + { + transpose_batch1 = 't'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch1 = transpose_result ? 'n' : 't'; + // batch1_ is later freed if batch1_ != batch1 + if (THCTensor_(isContiguous)(state, batch1)) { + batch1_ = batch1; + } else { + batch1_ = THCTensor_(newContiguous)(state, batch1); + } + lda = batch1_->stride[1]; + } + + if (batch2->stride[transpose_result ? 2 : 1] == 1 && + batch2->stride[transpose_result ? 1 : 2] != 0) + { + transpose_batch2 = 'n'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 1 : 2]; + } + else if (batch2->stride[transpose_result ? 1 : 2] == 1 && + batch2->stride[transpose_result ? 
2 : 1] != 0) + { + transpose_batch2 = 't'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch2 = transpose_result ? 'n' : 't'; + // batch2_ is later freed if batch2_ != batch2 + if (THCTensor_(isContiguous)(state, batch2)) { + batch2_ = batch2; + } else { + batch2_ = THCTensor_(newContiguous)(state, batch2); + } + ldb = batch2_->stride[1]; + } + int64_t num_batches = result_->size[0]; + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + // Compute pointers to matrices in each batch. +#if CUDA_VERSION < 8000 + size_t matrices_size = num_batches * sizeof(real*); + +// Copy pointers to device. + auto d_matrices1 = static_cast(THCudaMalloc(state, matrices_size)); + auto d_matrices2 = static_cast(THCudaMalloc(state, matrices_size)); + auto d_result_matrices = static_cast(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + + createBatchGemmBuffer3<<>>( + d_matrices1, d_matrices2, (const real**)d_result_matrices, THCTensor_(data)(state, batch1_), + THCTensor_(data)(state, batch2_), THCTensor_(data)(state, result_), + batch1_->stride[0], batch2_->stride[0], result_->stride[0], num_batches); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_SgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_DgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#endif //THC_REAL + + THCudaFree(state, d_matrices1); + THCudaFree(state, d_matrices2); + THCudaFree(state, d_result_matrices); + +#else +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_SgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_DgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); +#endif //THC_REAL +#endif //CUDA_VERSION + +#elif defined(THC_REAL_IS_HALF) + +#if CUDA_VERSION < 9010 + // Currently no HgemmBatched in Cublas + for (int64_t i = 0; i < num_batches; ++i) { + THCudaBlas_Hgemm( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 
1 : 2], + alpha, + THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + beta, + THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + } +#else + cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); + if (prop->major >= 5){ + + THCudaBlas_HgemmStridedBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], + THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + beta, + THCTensor_(data)(state, result_), ldc, result_->stride[0], + num_batches); + } else { + for (int64_t i = 0; i < num_batches; ++i) { + THCudaBlas_Hgemm( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + beta, + THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + } + } + +#endif +#endif + if (batch1_ != batch1) { + THCTensor_(free)(state, batch1_); + } + + if (batch2_ != batch2) { + THCTensor_(free)(state, batch2_); + } + + if (result_ != result) { + THCTensor_(freeCopyTo)(state, result_, result); + } + +#else + ERROR_ONLY_FP_TYPES("baddbmm"); +#endif +} + +THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); + THArgCheck(THCTensor_(_nDimension)(state, a) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(size)(state, a, 1) == + THCTensor_(size)(state, a, 2), 3, "matrices must be square"); + + if (ra_ != a) { + THCTensor_(resizeAs)(state, ra_, a); + // not sure if this is kosher, but things are nicer if we return in column major + if (ra_->stride[0] == 1) { + THCTensor_(transpose)(state, ra_, NULL, 1, 0); + } else if (ra_->stride[2] == 1) { + THCTensor_(transpose)(state, ra_, NULL, 1, 2); + } + THCTensor_(copy)(state, ra_, a); + } + + + int n = a->size[1]; + int lda; + THCTensor *ra__; + + if (ra_->stride[1] == 1) { + // column ordered, what BLAS wants + lda = ra_->stride[2]; + ra__ = ra_; + } else { + // not column ordered, need to make it such (requires copy) + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, ra_, 1, 2); + ra__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, ra__, NULL, 1, 2); + lda = ra__->stride[2]; + } + + int64_t num_batches = ra__->size[0]; + + if (!pivot) { + THCudaIntTensor *t = THCudaIntTensor_new(state); + THCudaIntTensor_range(state, t, 1, n, 1); + THCudaIntTensor_unsqueeze1d(state, t, t, 0); + THCudaIntTensor** ptrs = (THCudaIntTensor**) THAlloc(sizeof(THCudaIntTensor*)*num_batches); + for (int64_t i=0; i(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, ra__), + ra__->stride[0], num_batches); + + int *pivots_gpu = NULL; + if (pivot) { + pivots_gpu = THCudaIntTensor_data(state, rpivots_); + } +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgetrf(state, n, d_result, lda, pivots_gpu, 
info_gpu, num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgetrf(state, n, d_result, lda, pivots_gpu, info_gpu, num_batches); +#endif + + THCudaFree(state, d_result); + + if (ra__ != ra_) { + THCTensor_(freeCopyTo)(state, ra__, ra_); + } + + if (free_rinfo_) { + int min = THCudaIntTensor_minall(state, rinfo_); + int max = THCudaIntTensor_maxall(state, rinfo_); + THCudaIntTensor_free(state, rinfo_); + if (min != 0 || max != 0) { + THError("failed to factorize some batch elements (min info == %d, max info == %d)", + min, max); + } + } + +#else + THError("btrifact for CUDA tensors is only supported for floats and doubles"); +#endif +} + + +THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); + THArgCheck(THCTensor_(_nDimension)(state, atf) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(_nDimension)(state, b) == 3 || + THCTensor_(_nDimension)(state, b) == 2, 4, "expected 2D or 3D tensor"); + THArgCheck(THCTensor_(size)(state, atf, 0) == + THCTensor_(size)(state, b, 0), 3, "number of batches must be equal"); + THArgCheck(THCTensor_(size)(state, atf, 1) == + THCTensor_(size)(state, atf, 2), 3, "A matrices must be square"); + THArgCheck(THCTensor_(size)(state, atf, 1) == + THCTensor_(size)(state, b, 1), 3, "dimensions of A and b must be equal"); + + if (rb_ != b) { + THCTensor_(resizeAs)(state, rb_, b); + THCTensor_(copy)(state, rb_, b); + } + + + int n = atf->size[1]; + int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + THCTensor *atf_; + THCTensor *rb__; + int lda, ldb; + + // correct ordering of A_tf + if (atf->stride[1] == 1) { + // column ordered, what BLAS wants + lda = atf->stride[2]; + atf_ = atf; + } else { + // not column ordered, need to make it such (requires copy) + // it would be nice if we could use the op(A) flags to automatically + // transpose A if needed, but this leads to unpredictable behavior if the + // user clones A_tf later with a different ordering + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, atf, 1, 2); + atf_ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, atf_, NULL, 1, 2); + lda = atf_->stride[2]; + } + + // correct ordering of B + if (rb_->stride[1] == 1) { + // column ordered + if (rb_->_dim() == 2 || rb_->size[2] == 1) { + ldb = n; + } else { + ldb = rb_->stride[2]; + } + rb__ = rb_; + } else { + // make column ordered + if (rb_->_dim() > 2) { + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, rb_, 1, 2); + rb__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, rb__, NULL, 1, 2); + ldb = rb__->stride[2]; + } else { + rb__ = THCTensor_(newClone)(state, rb_); + ldb = n; + } + } + + int64_t num_batches = rb_->size[0]; + size_t matrices_size = num_batches * sizeof(real*); + + // Copy pointers to device. 
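+  // d_result and d_atf each hold one device pointer per batch element; createBatchGemmBuffer fills them below.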
+ auto d_result = static_cast(THCudaMalloc(state, matrices_size)); + auto d_atf = static_cast(THCudaMalloc(state, matrices_size)); + + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, rb__), + rb__->stride[0], num_batches); + createBatchGemmBuffer<<>>( + d_atf, THCTensor_(data)(state, atf_), + atf_->stride[0], num_batches); + + if (!THCudaIntTensor_isContiguous(state, pivots)) { + THError("Error: pivots is not contiguous."); + } + + int *pivots_data = THCudaIntTensor_data(state, pivots); + int info; + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgetrs(state, 'n', n, nrhs, d_atf, lda, pivots_data, d_result, ldb, &info, num_batches); +#endif + + if (info < 0) { + THError("Illegal arg %d", -info); + } + + THCudaFree(state, d_result); + THCudaFree(state, d_atf); + + if (atf_ != atf) { + THCTensor_(free)(state, atf_); + } + + if (rb__ != rb_) { + THCTensor_(freeCopyTo)(state, rb__, rb_); + } + +#else + THError("btrisolve for CUDA tensors is only supported for floats and doubles"); +#endif +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathBlas.h b/aten/src/THC/generic/THCTensorMathBlas.h new file mode 100644 index 0000000..1279d7e --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathBlas.h @@ -0,0 +1,16 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.h" +#else + +THC_API accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(addmv)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec); +THC_API void THCTensor_(addmm)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat1, THCTensor *mat2); +THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2); +THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); +THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); + +THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a); +THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *atf, THCudaIntTensor *pivots); + + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu new file mode 100644 index 0000000..fca7046 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -0,0 +1,101 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp(value)); +} + +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp(value)); +} + +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) 
+{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp(value)); +} + +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp(value)); +} + +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp(value)); +} + +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp(value)); +} + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp(value)); +} + +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp(value)); +} + +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp(value)); +} + +THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp(value)); +} + +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp(value)); +} + +THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp(value)); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompare.h b/aten/src/THC/generic/THCTensorMathCompare.h new file mode 100644 index 0000000..7b8837c --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompare.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void 
THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); + + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu new file mode 100644 index 0000000..ee7bc41 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -0,0 +1,113 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" +#else + +THC_API void +THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLTOp()); +} + +THC_API void +THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGTOp()); +} + +THC_API void +THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLEOp()); +} + +THC_API void +THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGEOp()); +} + +THC_API void +THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorEQOp()); +} + +THC_API void +THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorNEOp()); +} + +THC_API void +THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLTOp()); +} + +THC_API void +THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGTOp()); +} + +THC_API void +THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorLEOp()); +} + +THC_API void +THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorGEOp()); +} + +THC_API void +THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorEQOp()); +} + +THC_API void +THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, 
self_, src1, src2)); + THC_logicalTensor(state, self_, src1, src2, + TensorNEOp()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathCompareT.h b/aten/src/THC/generic/THCTensorMathCompareT.h new file mode 100644 index 0000000..0d76835 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathCompareT.h @@ -0,0 +1,19 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.h" +#else + +THC_API void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu new file mode 100644 index 0000000..fa72207 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -0,0 +1,737 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.cu" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#ifdef USE_MAGMA + +static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int k) +{ + int64_t size[1] = { k }; + int64_t stride[1] = { 1 }; + THCTensor_(resizeNd)(state, self, 1, size, stride); + size_t len = k * sizeof(real); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n) +{ + int64_t size[2] = { m, n }; + int64_t stride[2] = { 1, m }; + THCTensor_(resizeNd)(state, self, 2, size, stride); + size_t len = m * n * sizeof(real); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) +{ + THAssert(self->_dim() == 2); + size_t len = THCTensor_(nElement)(state, self)*sizeof(real); + THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); + THCTensor *selfc = THCTensor_(newContiguous)(state, temp); + THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, selfc->storage) + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); + THCTensor_(free)(state, temp); + THCTensor_(free)(state, selfc); +} + +#endif // USE_MAGMA + +static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) +{ + THAssert(src->_dim() == 2); + if 
(self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) + { + THCTensor_(retain)(state, self); + return self; + } + + if (self == src) + self = THCTensor_(new)(state); + else + THCTensor_(retain)(state, self); + + int64_t size[2] = { src->size[0], src->size[1] }; + int64_t stride[2] = { 1, src->size[0] }; + + THCTensor_(resizeNd)(state, self, 2, size, stride); + THCTensor_(copy)(state, self, src); + return self; +} + + +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); + THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + + int64_t n = a_->size[0]; + int64_t nrhs = b_->size[1]; + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int *ipiv = th_magma_malloc_pinned(n); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#else + magma_dgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#endif + + if (info < 0) + THError("MAGMA gesv : Argument %d : illegal value", -info); + else if (info > 0) + THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info); + + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gesv)); +#endif +} + +THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); + THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + + magma_side_t sz = MagmaLeft; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + magma_trans_t ts = trans[0] == 'N' ? MagmaNoTrans : MagmaTrans; + magma_diag_t dg = diag[0] == 'U' ? 
MagmaUnit : MagmaNonUnit; + + real alpha = 1; + + int64_t n = a_->size[0]; + int64_t nrhs = b_->size[1]; + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + +#if defined(THC_REAL_IS_FLOAT) + magma_strsm(sz, ul, ts, dg, n, nrhs, alpha, a_data, n, b_data, n); +#else + magma_dtrsm(sz, ul, ts, dg, n, nrhs, alpha, a_data, n, b_data, n); +#endif + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(trtrs)); +#endif +} + +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); + THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == b_->size[0], 2, "Expected A and b to have same size " + "at dim 0, but they have incompatible sizes"); + THArgCheck(a_->size[0] >= a_->size[1], 2, "Expected A with shape (m x n) to have " + "m >= n. The case for m < n is not implemented yet."); + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t nrhs = b->size[1]; + real wkopt; + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#endif + + real *hwork = th_magma_malloc_pinned((size_t)wkopt); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#endif + + magma_free_pinned(hwork); + + if (info != 0) + THError("MAGMA gels : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gels)); +#endif +} + +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +{ +#ifdef USE_MAGMA + int64_t n = a->size[0]; + int64_t lda = n; + + magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; + magma_vec_t jobz = jobzs[0] == 'N' ? 
MagmaNoVec : MagmaVec; + + THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a); + real *input_data = THCTensor_(data)(state, input); + + // eigen values and workspace + real *w = th_magma_malloc_pinned(n); + real *wA = th_magma_malloc_pinned(lda * n); + + // compute optimal size of work array + int info; + real lwork; + int liwork; + +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#endif + + real *work = th_magma_malloc_pinned((size_t)lwork); + int *iwork = th_magma_malloc_pinned(liwork); + + // compute eigenvalues and, optionally, eigenvectors +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#endif + + // copy eigen values from w to re_ + if (info == 0) + THCTensor_(copyArray1d)(state, re_, w, n); + + magma_free_pinned(iwork); + magma_free_pinned(work); + magma_free_pinned(wA); + magma_free_pinned(w); + + // check error value + if (info > 0) + THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA syev : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, input, rv_); +#else + THError(NoMagma(syev)); +#endif +} + +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); + + magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; + int64_t n = a_->size[0]; + + real *a_data = th_magma_malloc_pinned(n * n); + THCTensor_(copyTensor2d)(state, a_data, a_); + + real *wr = th_magma_malloc_pinned(n); + real *wi = th_magma_malloc_pinned(n); + + real *vr_data = NULL; + int64_t ldvr = 1; + if (jobvr == MagmaVec) + { + vr_data = th_magma_malloc_pinned(n * n); + ldvr = n; + } + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned(lwork); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA geev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA geev : Argument %d : illegal value", -info); + + { + THCTensor_(resize2d)(state, re_, 2, n); + THCTensor *re = THCTensor_(newContiguous)(state, re_); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + THCTensor_(freeCopyTo)(state, re, re_); + THCTensor_(transpose)(state, re_, NULL, 0, 1); + } + + if (jobvr == MagmaVec) + THCTensor_(copyArray2d)(state, rv_, vr_data, n, n); + + magma_free_pinned(work_data); + magma_free_pinned(vr_data); + magma_free_pinned(wi); + magma_free_pinned(wr); + magma_free_pinned(a_data); + +#else + THError(NoMagma(geev)); +#endif +} + +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +{ +#ifdef USE_MAGMA + THCTensor *ra_ = THCTensor_(new)(state); + THCTensor_(gesvd2)(state, ru_, rs_, rv_, ra_, a, jobu); + THCTensor_(free)(state, ra_); +#else + THError(NoMagma(gesvd)); +#endif +} + +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; + + int iunused[1]; + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = m < n ? m : n; + int64_t j = (jobz == MagmaAllVec) ? m : k; + int64_t jv = (jobz == MagmaAllVec) ? 
n : k; + + real *a_data = th_magma_malloc_pinned(m * n); + THCTensor_(copyTensor2d)(state, a_data, a); + + real *rs_data = th_magma_malloc_pinned(k); + real *ru_data = th_magma_malloc_pinned(m * j); + real *rv_data = th_magma_malloc_pinned(n * n); + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info); +#else + magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, iunused, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned(lwork); + int *iwork = th_magma_malloc_pinned(8 * k); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info); +#else + magma_dgesdd(jobz, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, iwork, &info); +#endif + + if (info > 0) + THError("MAGMA gesdd : the updating process of SBDSDC did not converge (error: %d)", info); + else if (info < 0) + THError("MAGMA gesdd : Argument %d : illegal value", -info); + + THCTensor_(copyArray2d)(state, rv_, rv_data, n, n); + THCTensor_(transpose)(state, rv_, NULL, 0, 1); + if (jobz != MagmaAllVec) + THCTensor_(narrow)(state, rv_, rv_, 1, 0, jv); + THCTensor_(copyArray2d)(state, ru_, ru_data, m, j); + THCTensor_(copyArray1d)(state, rs_, rs_data, k); + THCTensor_(copyArray2d)(state, ra_, a_data, m, n); + + magma_free_pinned(work_data); + magma_free_pinned(iwork); + magma_free_pinned(rv_data); + magma_free_pinned(ru_data); + magma_free_pinned(rs_data); + magma_free_pinned(a_data); +#else + THError(NoMagma(gesvd2)); +#endif +} + +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +{ + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + +#ifdef USE_MAGMA + int info; + int64_t n = a->size[0]; + int lwork = n * magma_get_sgetri_nb(n); + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int *ipiv = th_magma_malloc_pinned(n); + + THCTensor *work = THCTensor_(newWithSize1d)(state, lwork); + real *work_data = THCTensor_(data)(state, work); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); +#else + magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info); +#endif + + if (info > 0) + THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#else + magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getri : Argument %d : illegal value", -info); + + THCTensor_(free)(state, work); + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, input, ra_); +#else + int64_t n = a->size[0]; + + // input + THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); + THCTensor_(resizeNd)(state, ra_, 2, input->size, input->stride); + + real *matrices1[1] = { THCTensor_(data)(state, input) }; + real *matrices2[1] = { THCTensor_(data)(state, ra_) }; + + // Copy pointers to device. 
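+  // THCudaBlas_*getrf/getri below use the batched cuBLAS interface, which takes an array of + // matrix pointers that must itself live in device memory; the single matrix pointer is + // therefore staged through a one-element host array and copied over, with a batch count of 1.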
+  auto d_matrices1 = static_cast<real**>(THCudaMalloc(state, sizeof(real*))); + auto d_matrices2 = static_cast<real**>(THCudaMalloc(state, sizeof(real*))); + + THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, sizeof(real*), + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, sizeof(real*), + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + int info; + auto info_gpu = static_cast<int*>(THCudaMalloc(state, sizeof(int))); + + auto ipiv_gpu = static_cast<int*>(THCudaMalloc(state, n * sizeof(int))); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#else + THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#endif + + THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); + + if (info > 0) + THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetri(state, n, (const real**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#else + THCudaBlas_Dgetri(state, n, (const real**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#endif + + THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); + + if (info > 0) + THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getri : Argument %d : illegal value", -info); + + THCudaFree(state, ipiv_gpu); + THCudaFree(state, info_gpu); + + THCudaFree(state, d_matrices1); + THCudaFree(state, d_matrices2); + + THCTensor_(free)(state, input); +#endif +} + +__global__ void THCTensor_(copyUpperSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r > c) { + input[idx] = input[r*n + c]; + } + } +} + +__global__ void THCTensor_(copyLowerSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r < c) { + input[idx] = input[r*n + c]; + } + } +} + +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? 
MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotri_gpu(ul, n, input_data, n, &info); +#else + magma_dpotri_gpu(ul, n, input_data, n, &info); +#endif + + if (info > 0) + THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potri : Argument %d : illegal value", -info); + + cudaStream_t stream = THCState_getCurrentStream(state); + const int len = n*n; + dim3 blocks(std::min(DIVUP(len, 128), 65535)); + dim3 threads(128); + if (uplo[0] == 'U') { + THCTensor_(copyUpperSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } else { + THCTensor_(copyLowerSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } + + THCTensor_(freeCopyTo)(state, input, ra_); +#else + THError(NoMagma(potri)); +#endif +} + +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrf_gpu(ul, n, input_data, n, &info); +#else + magma_dpotrf_gpu(ul, n, input_data, n, &info); +#endif + + // check error value + if (info > 0) + THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potrf : Argument %d : illegal value", -info); + + if (uplo[0] == 'U') { + THCTensor_(triu)(state, ra_, input, 0); + } else { + THCTensor_(tril)(state, ra_, input, 0); + } + THCTensor_(free)(state, input); +#else + THError(NoMagma(potrf)); +#endif +} + +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int64_t n = a->size[0]; + int64_t nrhs = b->size[1]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b); + real *b_data = THCTensor_(data)(state, b_); + THCTensor *a_ = THCTensor_(newColumnMajor)(state, a, a); + real *a_data = THCTensor_(data)(state, a_); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#else + magma_dpotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#endif + + // check error value + if (info < 0) + THError("MAGMA potrs : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, b_, rb_); + THCTensor_(free)(state, a_); +#else + THError(NoMagma(potrs)); +#endif +} + +THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = (m < n ? 
m : n); + +#if defined(THC_REAL_IS_FLOAT) + int64_t nb = magma_get_sgeqrf_nb(m, n); +#else + int64_t nb = magma_get_dgeqrf_nb(m, n); +#endif + + real *rtau_data = th_magma_malloc_pinned(k); + real *a_data = THCTensor_(data)(state, a); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf2_gpu(m, n, a_data, m, rtau_data, &info); +#else + magma_dgeqrf2_gpu(m, n, a_data, m, rtau_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf2 : Argument %d : illegal value.", -info); + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(copyArray1d)(state, rtau_, rtau_data, k); + magma_free_pinned(rtau_data); +#else + THError(NoMagma(geqrf)); +#endif +} + +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_); + int64_t m = a->size[0]; + int64_t n = a->size[1]; + int64_t k = (m < n ? m : n); + +#if defined(THC_REAL_IS_FLOAT) + int64_t nb = magma_get_sgeqrf_nb(m, n); +#else + int64_t nb = magma_get_dgeqrf_nb(m, n); +#endif + + real *a_data = THCTensor_(data)(state, a); + real *tau_data = th_magma_malloc_pinned(k); + THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + magma_roundup(n, 32))*nb); + real *work_data = THCTensor_(data)(state, work); + + int info; + // We need to call two different versions of ?geqrf: + // ?geqrf_gpu allows fast computation of Q via ?orqrf_gpu, but doesn't give + // R properly. Note that the MAGMA documentation for this method is wrong. + // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800 + // ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orqrf_gpu +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf2_gpu(m, n, a_data, m, tau_data, &info); +#else + magma_dgeqrf2_gpu(m, n, a_data, m, tau_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf2 : Argument %d : illegal value.", -info); + + THCTensor_(narrow)(state, a, a, 0, 0, k); + THCTensor_(triu)(state, rr_, a, 0); + THCTensor_(free)(state, a); + + a = THCTensor_(newColumnMajor)(state, rq_, a_); + a_data = THCTensor_(data)(state, a); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#else + magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf : Argument %d : illegal value.", -info); + + THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a); + real *q_data = THCTensor_(data)(state, q); + +#if defined(THC_REAL_IS_FLOAT) + magma_sorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info); +#else + magma_dorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info); +#endif + + if (info != 0) + THError("MAGMA orgqr : Argument %d : illegal value.", -info); + + THCTensor_(free)(state, a); + THCTensor_(free)(state, work); + magma_free_pinned(tau_data); + + THCTensor_(narrow)(state, q, q, 1, 0, k); + THCTensor_(freeCopyTo)(state, q, rq_); +#else + THError(NoMagma(qr)); +#endif +} + +#endif + +#endif diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h new file mode 100644 index 0000000..1462af4 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathMagma.h @@ -0,0 +1,25 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +// MAGMA (i.e. 
CUDA implementation of LAPACK functions) +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag); +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo); +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr); +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *a, THCTensor *b, const char *uplo); +THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_); +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a); + +#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu new file mode 100644 index 0000000..e0f1219 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -0,0 +1,340 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" +#else + +THC_API void +THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorSubConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorSubConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, real value, real alpha) +{ +#ifdef THC_REAL_IS_HALF + auto v = THC_half2float(value) * THC_half2float(alpha); + THCTensor_(add)(state, self_, src_, THC_float2half(v)); +#else + THCTensor_(add)(state, self_, src_, value * alpha); +#endif +} + +THC_API void +THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, real value, real alpha) +{ +#ifdef THC_REAL_IS_HALF + auto v = THC_half2float(value) * THC_half2float(alpha); 
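+  // The scalar product value * alpha is computed in float and converted back to half, + // since THC half scalars have no host-side arithmetic of their own here.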
+ THCTensor_(sub)(state, self_, src_, THC_float2half(v)); +#else + THCTensor_(sub)(state, self_, src_, value * alpha); +#endif +} + +THC_API void +THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorDivConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCTensor_(mul)(state, self_, src_, pow(2, value)); +#elif defined(THC_REAL_IS_HALF) + return THError("lshift not supported for torch.CudaHalfTensor"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorLShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorLShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THCTensor_(mul)(state, self_, src_, pow(2, -value)); +#elif defined(THC_REAL_IS_HALF) + return THError("rshift not supported for torch.CudaHalfTensor"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorRShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorRShiftConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorFmodOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorFmodOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorRemainderOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + 
THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorRemainderOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(!src_->is_empty() && src_->dim() == 2, 1, "expected a matrix"); + + if (self_ != src_) + THCTensor_(resizeAs)(state, self_, src_); + + int64_t stride0 = self_->stride[0]; + int64_t stride1 = self_->stride[1]; + real *start = THCTensor_(data)(state, self_); + + TensorTriOp op(start, stride0, stride1, k); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(!src_->is_empty() && src_->dim() == 2, 1, "expected a matrix"); + + if (self_ != src_) + THCTensor_(resizeAs)(state, self_, src_); + + int64_t stride0 = self_->stride[0]; + int64_t stride1 = self_->stride[1]; + real *start = THCTensor_(data)(state, self_); + + TensorTriOp op(start, stride0, stride1, k); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + + if (!THC_pointwiseApply2(state, self_, src_, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (!THCTensor_(isSameSizeAs(state, self_, src_))) { + return 0; + } + + // This is not as efficient as TH, but the basic idea: create a buffer that stores + // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value + // in this buffer is 1, the two tensors are equal, otherwise they are not + + THLongStorage *size = THCTensor_(newSizeOf)(state, self_); + THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, size, NULL); + + if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + unsigned char min = THCudaByteTensor_minall(state, buf); + + THLongStorage_free(size); + THCudaByteTensor_free(state, buf); + + return min != 0; +} + +THC_API void +THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitand only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitxor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h new file mode 100644 index 0000000..b54b0c6 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -0,0 +1,21 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h" +#else + +THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self, THCTensor *src, real value, real alpha); +THC_API void THCTensor_(sub_scaled)(THCState *state, THCTensor *self, THCTensor *src, real value, real alpha); +THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(rshift)(THCState *state, 
THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, real value); + +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu new file mode 100644 index 0000000..7fb6fda --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -0,0 +1,765 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu" +#else + +#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \ + struct Tensor_##NAME##_##REAL##_Op { \ + __device__ __forceinline__ void operator()(real* out, real* in) const { \ + *out = CFUNC(*in); \ + } \ + \ + __device__ __forceinline__ void operator()(real* v) const { \ + *v = CFUNC(*v); \ + } \ + }; \ + \ + void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \ + if (self_ == src) { \ + if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \ + THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ + } \ + } else { \ + THCTensor_(resizeAs)(state, self_, src); \ + \ + if (!THC_pointwiseApply2(state, self_, src, Tensor_##NAME##_##REAL##_Op())) { \ + THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ + } \ + } \ + \ + THCudaCheck(cudaGetLastError()); \ + } + +#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC, REAL) \ + IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log, THCNumerics::log, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(lgamma, THCNumerics::lgamma, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log10, THCNumerics::log10, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics::log1p, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log2, THCNumerics::log2, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( exp, THCNumerics::exp, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(expm1, THCNumerics::expm1, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cos, THCNumerics::cos, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sin, THCNumerics::sin, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sqrt, THCNumerics::sqrt, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(rsqrt, THCNumerics::rsqrt, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( ceil, THCNumerics::ceil, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(floor, THCNumerics::floor, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(trunc, THCNumerics::trunc, Real) + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( acos, THCNumerics::acos, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cosh, THCNumerics::cosh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( asin, THCNumerics::asin, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sinh, THCNumerics::sinh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( tan, THCNumerics::tan, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( atan, THCNumerics::atan, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( tanh, THCNumerics::tanh, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( erf, THCNumerics::erf, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( erfc, THCNumerics::erfc, Real) 
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(erfinv, THCNumerics::erfinv,Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( round, THCNumerics::round, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( frac, THCNumerics::frac, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cinv, THCNumerics::cinv, Real) + +#endif + +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( neg, THCNumerics::neg, Real) +IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics::abs, Real) + +#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_ +#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC + +void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real min_value, + real max_value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); + + int i; + int nd = THCTensor_(nDimension)(state, x); + ptrdiff_t nelem = THCTensor_(nElement)(state, x); + THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions"); + for (i = 0; i < nd; i++) { + THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i); + if (dimension < 0 && THCTensor_(size)(state, x, i) == 3) { + dimension = i; + } + } + + THArgCheck(dimension >= 0 && dimension < nd, 3, "dimension %d out of range", dimension+1); + THArgCheck(THCTensor_(size)(state, x, dimension) == 3, 3, + "dimension %d does not have size 3", dimension+1); + THCTensor_(resizeAs)(state, self, x); + + int64_t sx = THCTensor_(stride)(state, x, dimension); + int64_t sy = THCTensor_(stride)(state, y, dimension); + int64_t so = THCTensor_(stride)(state, self, dimension); + THCTensor *nx = THCTensor_(newNarrow)(state, x, dimension, 0, 1); + THCTensor *ny = THCTensor_(newNarrow)(state, y, dimension, 0, 1); + THCTensor *nself = THCTensor_(newNarrow)(state, self, dimension, 0, 1); + if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + THCTensor_(free)(state, nx); + THCTensor_(free)(state, ny); + THCTensor_(free)(state, nself); +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +void THCTensor_(atan2)(THCState *state, THCTensor *self_, THCTensor *tx, THCTensor *ty) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, tx, ty)); + THArgCheck(THCTensor_(nElement)(state, tx) == + THCTensor_(nElement)(state, ty), 3, "sizes do not match"); + THCTensor_(resizeAs)(state, self_, tx); + + if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { + THArgCheck(false, 2, 
CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(digamma)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ != src) { + THCTensor_(resizeAs)(state, self_, src); + } + if (!THC_pointwiseApply2(state, self_, src, TensorDigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(polygamma)(THCState* state, THCTensor* self_, int64_t n, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ != src) { + THCTensor_(resizeAs)(state, self_, src); + } + switch (n) { + case 0: + if (!THC_pointwiseApply2(state, self_, src, TensorDigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + break; + case 1: + if (!THC_pointwiseApply2(state, self_, src, TensorTrigammaOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + break; + default: + THError("polygamma(n,x) is not implemented for n>=2"); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); + THArgCheck(THCTensor_(nElement)(state, a) == + THCTensor_(nElement)(state, b), 3, "sizes do not match"); + THCTensor_(resizeAs)(state, result, a); + + if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif + +THC_API void +THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert::to(1)) { + // self += src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += value * src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert::to(1)) { + // self = src1 + src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 + value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert::to(1)) { + // self -= src2 + 
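+      // (When value != 1, the branches below express the subtraction as self += (-value) * src2, + // i.e. TensorCAddOp applied with the negated scalar, so no separate scaled-subtract op is needed.)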
if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += -value * src2 + if (!THC_pointwiseApply2(state, self_, src2, + TensorCAddOp( + ScalarNegate::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert::to(1)) { + // self = src1 - src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 - value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, + TensorCAddOp( + ScalarNegate::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self *= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self = pow(self, src2) + if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = pow(src1, src2) + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real value) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (THCNumerics::eq(value, ScalarConvert::to(1))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(2))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(3))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + } else if (THCNumerics::eq(value, ScalarConvert::to(-1))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(-2))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#endif + } else { + // fallback implementation using pow + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if 
(THCNumerics::eq(value, ScalarConvert::to(1))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(2))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(3))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + } else if (THCNumerics::eq(value, ScalarConvert::to(-1))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else if (THCNumerics::eq(value, ScalarConvert::to(-2))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } +#endif + } else { + // fallback implementation using pow + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *src) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorTPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} +THC_API void +THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) + return THError("clshift not supported for torch.CudaHalfTensor"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorLShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorLShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) + return THError("crshift not supported for torch.CudaHalfTensor"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + 
THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorRShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorRShiftOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorMinOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorCRemainderOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorCRemainderOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 2, "sizes do not match"); + + if (self == src1) { + if (!THC_pointwiseApply2(state, self, src2, TensorCFmodOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src1); + if (!THC_pointwiseApply3(state, self, src1, src2, TensorCFmodOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + + if (self == src) { + if (!THC_pointwiseApply1(state, self, TensorMaxValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src); + if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value) +{ + 
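+  // Elementwise minimum against a scalar: self[i] = min(src[i], value); applied in place + // when self == src, otherwise the result is written into a resized self.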
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + + if (self == src) { + if (!THC_pointwiseApply1(state, self, TensorMinValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self, src); + if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } +} + +THC_API void +THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); + if(self_ != t) + { + THCTensor_(resizeAs)(state, self_, t); + THCTensor_(copy)(state, self_, t); + } + else + { + THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1), + 1, "sizes do not match"); + } + + THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2), + 3, "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); + if(self_ != t) + { + THCTensor_(resizeAs)(state, self_, t); + THCTensor_(copy)(state, self_, t); + } + else + { + THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1), + 1, "sizes do not match"); + } + THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2), + 3, "sizes do not match"); + + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitand is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + 
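+  // The pointwise kernels above are launched asynchronously; the
+  // cudaGetLastError() check below catches launch-time failures (e.g. an
+  // invalid launch configuration). Errors raised while a kernel executes
+  // surface later, at a subsequent error check or synchronizing call.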
THCudaCheck(cudaGetLastError()); +#endif +} + +THC_API void +THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} +#endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h new file mode 100644 index 0000000..7f79027 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -0,0 +1,72 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h" +#else + +THC_API void THCTensor_(pow)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(tpow)(THCState *state, THCTensor *self, real value, THCTensor *src); +THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(lgamma)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(digamma)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(polygamma)(THCState *state, THCTensor *self, int64_t n, THCTensor *src); +THC_API void THCTensor_(log10)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(log2)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(expm1)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(cos)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(acos)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(cosh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sin)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(asin)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sinh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(tan)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(atan)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(atan2)(THCState *state, THCTensor *r_, THCTensor *tx, THCTensor *ty); +THC_API void THCTensor_(tanh)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erf)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erfc)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(erfinv)(THCState *state, THCTensor *self, THCTensor 
*src); +THC_API void THCTensor_(sqrt)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(rsqrt)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(ceil)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(floor)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(round)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(trunc)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(frac)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w); + +THC_API void THCTensor_(cinv)(THCState *state, THCTensor *self, THCTensor *src); + +#endif + +THC_API void THCTensor_(neg)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(abs)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, real min_value, real max_value); +THC_API void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension); + +THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu new file mode 100644 index 0000000..e5d8e22 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -0,0 +1,476 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" +#else + +THC_API void +THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { + 
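+  // THC_reduceDim composes a per-element transform, a binary accumulate op,
+  // and a finalizer applied to the accumulated value, seeded with the
+  // reduction's identity element, roughly:
+  //   out[j] = finalize(reduce_i(transform(src_j[i])))
+  // sum uses (identity, add, identity) seeded with 0; prod below uses
+  // (identity, multiply, identity) seeded with 1, and mean divides by the
+  // size of the reduced dimension in its finalizer.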
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceMultiply{}, + thrust::identity{}, + scalar_cast(1), + dimension, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); + if (!THC_reduceDim(state, self, src, + thrust::identity{}, + ReduceAdd{}, + ReduceDivide{size}, + scalar_cast(0), + dim, + keepdim)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void +THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, real maxnorm) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + THCTensor *self_; + THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0); + THCTensor *data = THCTensor_(newClone)(state, src_); + ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0]; + + THArgCheck(dimension >= 0 && dimension < THCTensor_(_nDimension)(state, src), 3, "invalid dimension"); + THArgCheck(THCNumerics::gt(value, scalar_cast(0)), 2, "non-positive-norm not supported"); + THArgCheck(THCTensor_(_nDimension)(state, src) > 1, 1, "need at least 2 dimensions"); + + dim3 grid(data->size[0]); + dim3 threads(32); + + THCTensor_kernel_renorm + <<>> + (THCTensor_(data)(state, data), scalar_cast(value), size, scalar_cast(maxnorm)); + + cudaError errcode = cudaGetLastError(); + if(errcode != cudaSuccess) + THError(cudaGetErrorString(errcode)); + + THCTensor_(free)(state, src_); + self_ = THCTensor_(newTranspose)(state, data, dimension, 0); + THCTensor_(resizeAs)(state, self, self_); + THCTensor_(freeCopyTo)(state, self_, self); + THCTensor_(free)(state, data); +} + +THC_API void +THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + + THCTensor_preserveReduceDimSemantics( + state, self_, THCTensor_(_nDimension)(state, src), dimension, keepdim); + THLongStorage *dim = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, self_, dim, NULL); + THLongStorage_free(dim); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (dimension == THCTensor_(_nDimension)(state, src) - 1) { + THCTensor_varInnermostDim(state, self, src, biased); + } else { + THCTensor_varOuterDim(state, self, src, dimension, biased); + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, self_, self_, dimension); + } +} + +THC_API void +THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +{ + 
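+  // var mirrors std above: the output temporarily keeps the reduced dimension
+  // with size 1, the innermost-dimension kernel is used when the reduced
+  // dimension is the last (contiguous) one and the outer-dimension variant
+  // otherwise, and the singleton dimension is squeezed away when keepdim is 0.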
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + + THCTensor_preserveReduceDimSemantics( + state, self_, THCTensor_(_nDimension)(state, src), dimension, keepdim); + THLongStorage *dim = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, self_, dim, NULL); + THLongStorage_free(dim); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (dimension == THCTensor_(_nDimension)(state, src) - 1) { + THCTensor_varInnermostDim(state, self, src, biased); + } else { + THCTensor_varOuterDim(state, self, src, dimension, biased); + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, self_, self_, dimension); + } +} + +THC_API accreal +THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); +} + +THC_API accreal +THCTensor_(varall)(THCState *state, THCTensor *self, int biased) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal mean = THCTensor_(meanall)(state, self); + + accreal val; + if (!THC_reduceAll(state, self, + SquareFunctor(mean), + ReduceAdd(), + scalar_cast(0), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + val = THCNumerics::div( + val, + scalar_cast(std::max(0, THCTensor_(nElement)(state, self) - (biased ? 0 : 1))) + ); + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API void +THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real _value, int dimension, int keepdim) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + if (THCNumerics::eq(value, scalar_cast(0))) { + THC_reduceDim(state, self, src, + TensorNonZeroOp{}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(1))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(2))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + ReducePow{scalar_cast(.5)}, + scalar_cast(0), + dimension, keepdim); + } else if (THCNumerics::eq(value, scalar_cast(INFINITY))) { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceMax{}, + thrust::identity{}, + scalar_cast(0), + dimension, keepdim); + } else { + THC_reduceDim(state, self, src, + TensorNormOp{value}, + ReduceAdd{}, + ReducePow{THCNumerics::cinv(value)}, + scalar_cast(0), + dimension, keepdim); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API accreal +THCTensor_(normall)(THCState *state, THCTensor *self, real _value) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal result; + + if (THCNumerics::eq(value, scalar_cast(0))) { + THC_reduceAll(state, self, + TensorNonZeroOp{}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + } else if (THCNumerics::eq(value, scalar_cast(1))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + } else if (THCNumerics::eq(value, scalar_cast(2))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + result = THCNumerics::sqrt(result); + } else if (THCNumerics::eq(value, 
scalar_cast(INFINITY))) { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceMax{}, + scalar_cast(0), + &result, 0); + } else { + THC_reduceAll(state, self, + TensorNormOp{value}, + ReduceAdd{}, + scalar_cast(0), + &result, 0); + result = THCNumerics::pow(result, + THCNumerics::cinv(value)); + } + + THCudaCheck(cudaGetLastError()); + return result; +} + +accreal THCTensor_(dist)(THCState *state, THCTensor *self, + THCTensor *src, real _value) +{ + const accreal value = scalar_cast(_value); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + self = THCTensor_(newContiguous)(state, self); + ptrdiff_t size = THCTensor_(nElement)(state, self); + src = THCTensor_(newContiguous)(state, src); + thrust::device_ptr self_data(THCTensor_(data)(state, self)); + thrust::device_ptr src_data(THCTensor_(data)(state, src)); + + THCThrustAllocator thrustAlloc(state); + accreal result = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + self_data, self_data+size, src_data, scalar_cast(0), + thrust::plus(), + ThrustTensorDistOp(value)); + + THCTensor_(free)(state, src); + THCTensor_(free)(state, self); + + return THCNumerics::pow(result, THCNumerics::cinv(value)); +} + +#endif + +THC_API accreal +THCTensor_(sumall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceAdd{}, + scalar_cast(0), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API accreal +THCTensor_(prodall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMultiply{}, + scalar_cast(1), + &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return val; +} + +THC_API accreal +THCTensor_(meanall)(THCState *state, THCTensor *self) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); +} + +THC_API real +THCTensor_(minall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMin{}, + THCNumerics::max(), &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return scalar_cast(val); +} + +THC_API real +THCTensor_(maxall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + accreal val; + if (!THC_reduceAll(state, self, + thrust::identity{}, + ReduceMax{}, + THCNumerics::min(), &val, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); + return scalar_cast(val); +} + +THC_API real +THCTensor_(medianall)(THCState *state, THCTensor *self) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + + real val; + ptrdiff_t nelem, k; + + nelem = THCTensor_(nElement)(state, self); + k = (nelem-1) >> 1; + + THLongStorage *size = THLongStorage_newWithSize1(nelem); + THCTensor *view = THCTensor_(newView)(state, self, size); + + THLongStorage_free(size); + + THCTensor *sorted = THCTensor_(new)(state); + THCudaLongTensor *indices = THCudaLongTensor_new(state); + + THCTensor_(sort)(state, sorted, indices, view, 0, 0); + + val = THCTensor_(get1d)(state, sorted, k); + + 
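+  // val is the lower median: with k = (nelem - 1) >> 1, an even count such as
+  // nelem = 6 reads sorted index 2 (the third-smallest element), so the two
+  // middle values are never averaged.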
THCTensor_(free)(state, view); + THCTensor_(free)(state, sorted); + THCudaLongTensor_free(state, indices); + + THCudaCheck(cudaGetLastError()); + + return val; +} + +THC_API void +THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); + + int64_t t_size_dim, k; + + t_size_dim = THCTensor_(size)(state, self, dimension); + + k = (t_size_dim-1) >> 1; + + THCTensor *sorted = THCTensor_(new)(state); + THCudaLongTensor *sorted_indices = THCudaLongTensor_new(state); + + THCTensor_(sort)(state, sorted, sorted_indices, self, dimension, 0); + + THCTensor *newValues = THCTensor_(newNarrow)(state, sorted, dimension, k, 1); + THCudaLongTensor *newIndices = THCudaLongTensor_newNarrow(state, sorted_indices, dimension, k, 1); + + THCTensor_(free)(state, sorted); + THCudaLongTensor_free(state, sorted_indices); + + if (!keepdim) { + THCTensor_(squeeze1d)(state, newValues, newValues, dimension); + THCudaLongTensor_squeeze1d(state, newIndices, newIndices, dimension); + } + + THCTensor_(resizeAs)(state, values, newValues); + THCudaLongTensor_resizeAs(state, indices, newIndices); + THCTensor_(copy)(state, values, newValues); + THCudaLongTensor_copy(state, indices, newIndices); + + THCTensor_(free)(state, newValues); + THCudaLongTensor_free(state, newIndices); + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); + + thrust::pair + init = + thrust::make_pair( + THCNumerics::min(), 0); + + return THC_reduceDimIndex( + state, values, indices, src, dimension, keepdim, init, + MaxValuePair()); +} + +THC_API void +THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); + + thrust::pair + init = + thrust::make_pair( + THCNumerics::max(), 0); + + return THC_reduceDimIndex( + state, values, indices, src, dimension, keepdim, init, + MinValuePair()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.h b/aten/src/THC/generic/THCTensorMathReduce.h new file mode 100644 index 0000000..4fbbc94 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathReduce.h @@ -0,0 +1,47 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathReduce.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, real max_norm); +THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, int dim, int biased, int keepdim); +THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, int dimension, int keepdim); +THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, int dim, int biased, int keepdim); + +THC_API accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased); +THC_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, real value); +THC_API accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased); + +#endif + +THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); +THC_API void 
THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); +THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); + +THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self); +THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self); +THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); +THC_API void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); + +THC_API real THCTensor_(minall)(THCState *state, THCTensor *self); +THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self); +THC_API real THCTensor_(medianall)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, int dim, int keepdim); + +THC_API accreal THCTensor_(dist)(THCState *state, THCTensor *self, THCTensor *src, + real value); + +#endif diff --git a/aten/src/THC/generic/THCTensorMathScan.cu b/aten/src/THC/generic/THCTensorMathScan.cu new file mode 100644 index 0000000..5aafb3b --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathScan.cu @@ -0,0 +1,122 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathScan.cu" +#else + +#ifndef THC_REAL_IS_HALF +template +__host__ void THCTensor_(scanThrust)( + THCState *state, + THCTensor *dst, + THCTensor *src, + BinaryFunction binary_op) +{ + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr src_data(THCTensor_(data)(state, src)); + thrust::device_ptr dst_data(THCTensor_(data)(state, dst)); + ptrdiff_t size = THCTensor_(nElement)(state, src); + thrust::inclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + src_data, src_data + size, dst_data, + binary_op); +} +#endif + +template +__host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt, + THCTensor *src, int dimension, + real init, BinaryOp binary_op) +{ + unsigned ndim = THCTensor_(_nDimension)(state, src); + // Treat all outer dimensions (i.e. dim < dimension) as one. + unsigned num_orows = 1; + for (int dim = 0; dim < dimension; dim++) { + num_orows *= THCTensor_(size)(state, src, dim); + } + unsigned row_size = THCTensor_(size)(state, src, dimension); + // Treat all inner dimensions (i.e. dim > dimension) as one. + unsigned num_irows = 1; + for (unsigned dim = dimension + 1; dim < ndim; dim++) { + num_irows *= THCTensor_(size)(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + THCTensor_kernel_scanOuterDim<<>>( + THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), + num_orows, num_irows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +__host__ void THCTensor_(scanInnermostDim)(THCState *state, THCTensor *tgt, + THCTensor *src, real init, + BinaryFunction binary_op) +{ + unsigned ndim = THCTensor_(_nDimension)(state, src); + // Treat all outer dimensions as a single dimension. 
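+  // All leading dimensions collapse into num_rows independent rows of length
+  // row_size (the innermost dimension). Judging by the launch configuration
+  // below, each block scans threads.y rows, with threads.x lanes cooperating
+  // along each row.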
+ unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= THCTensor_(size)(state, src, dim); + } + unsigned row_size = THCTensor_(size)(state, src, ndim - 1); + + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + THCTensor_kernel_scanInnermostDim<<>>( + THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), num_rows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template +void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, + int dimension, real init, BinaryFunction binary_op) +{ + // "init" must be the identity element for binary_op + int ndim = THCTensor_(nDimension)(state, src); + THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range", + dimension + TH_INDEX_BASE); + + THCTensor_(resizeAs)(state, self_, src); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + src = THCTensor_(newContiguous)(state, src); + + if (!self->is_empty()) { + #ifndef THC_REAL_IS_HALF + if (ndim == 1) { + // thrust does not take an "init" + THCTensor_(scanThrust)(state, self, src, binary_op); + } else + #endif + if (dimension == ndim - 1) { + THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); + } else { + THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + } + } + + THCTensor_(free)(state, src); + THCTensor_(freeCopyTo)(state, self, self_); +} + +void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + return THCTensor_(scanDim)(state, self, src, dimension, + ScalarConvert::to(0.0), AddOp()); +} + +void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, int dimension) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); + return THCTensor_(scanDim)(state, self, src, dimension, + ScalarConvert::to(1.0), MulOp()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorMathScan.h b/aten/src/THC/generic/THCTensorMathScan.h new file mode 100644 index 0000000..435519a --- /dev/null +++ b/aten/src/THC/generic/THCTensorMathScan.h @@ -0,0 +1,8 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathScan.h" +#else + +THC_API void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, int dim); +THC_API void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, int dim); + +#endif diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu new file mode 100644 index 0000000..d54d171 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -0,0 +1,323 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMode.cu" +#else + +THC_API void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { + THAssert(THCTensor_(isContiguous)(state, input)); + + // Because the input is contiguous, we want to get a reference to the + // location of the buffer at the innermost dimension that we are going + // to calculate the mode for --> we do this by manually doing the stride + // calculations to get an offset + real *data = THCTensor_(data)(state, input); + for (int i = 0; i < THLongStorage_size(position); ++i) { + data += THLongStorage_data(position)[i] * THCTensor_(stride)(state, input, i); + } + + int64_t nElement = THCTensor_(size)(state, input, THCTensor_(_nDimension)(state, input) - 
1); + THCThrustAllocator thrustAlloc(state); + + // Wrap input data, sortBuffer, in Thrust device vectors + thrust::device_ptr vecPtr = thrust::device_pointer_cast(data); + thrust::device_vector iter(vecPtr, vecPtr + nElement); + thrust::device_ptr sbPtr = thrust::device_pointer_cast(THCudaLongStorage_data(state, sortBuffer)); + thrust::device_vector seq(sbPtr, sbPtr + nElement); + + // Fill sortBuffer with [0, 1, 2, ... nElement - 1] + thrust::sequence( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + seq.begin(), seq.end()); + + // Sort the input data. The original indices of the data are stored in seq + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), seq.begin() +#if defined(THC_REAL_IS_HALF) + , ThrustHalfLess() +#endif + ); + + // Count # of unique elements via an inner product between adjacent elements. + // Add 1 if two neighboring element are not equal. + int unique = 1 + thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end() - 1, iter.begin() + 1, 0, thrust::plus(), +#if defined(THC_REAL_IS_HALF) + ThrustHalfNotEqualTo() +#else + thrust::not_equal_to() +#endif + ); + + // Count frequency of each element + thrust::device_vector keys(unique); + thrust::device_vector counts(unique); + thrust::reduce_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), + thrust::constant_iterator(1), keys.begin(), counts.begin() +#if defined(THC_REAL_IS_HALF) + , ThrustHalfEqualTo() +#endif + ); + + // Find index of maximum count + thrust::device_vector::iterator it = thrust::max_element( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + counts.begin(), counts.end()); + real mode = keys[it - counts.begin()]; + + // Find first index within which it occurs +#if defined(THC_REAL_IS_HALF) + thrust::device_vector::iterator positionIter = thrust::find_if( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), ThrustHalfEqualToPredicate(mode)); +#else + thrust::device_vector::iterator positionIter = thrust::find( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#else + thrust::device, +#endif + iter.begin(), iter.end(), mode); +#endif + + THAssert(positionIter != iter.end()); + int64_t index = TH_INDEX_BASE + seq[positionIter - iter.begin()]; + + // Place mode, index in output + ptrdiff_t valuesOffset = THCTensor_(storageOffset)(state, values); + int64_t indicesOffset = THCudaLongTensor_storageOffset(state, indices); + + for (int i = 0; i < THLongStorage_size(position); ++i) { + int64_t pos = THLongStorage_data(position)[i]; + valuesOffset += THCTensor_(stride)(state, values, i) * pos; + indicesOffset += THCudaLongTensor_stride(state, indices, i) * pos; + } + THCStorage_(set)(state, THCTensor_(storage)(state, values), valuesOffset, mode); + THCudaLongStorage_set(state, THCudaLongTensor_storage(state, indices), indicesOffset, index); +} + +// this probably could be a loop, not a recursive algorithm +THC_API void 
THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position, + int curDim) { + int64_t ndim = THCTensor_(_nDimension)(state, input); + + // Because we have transposed the Tensor, the data for the dimension we are mode'ing along + // is always in the innermost dimension + if (curDim == ndim - 1) { + THCTensor_(calculateMode)(state, values, indices, input, sortBuffer, dimension, position); + } else { + // Loop through the values and recurse + for (int i = 0; i < THCTensor_(size)(state, input, curDim); ++i) { + THLongStorage_data(position)[curDim] = i; + THCTensor_(dimApplyMode)(state, values, indices, input, sortBuffer, dimension, position, curDim + 1); + } + } +} + +#define MAX_GRID_SIZE 65535 +#define MAX_BLOCK_SIZE 1024 + +THC_API void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { + THLongStorage *dim; + THCTensor *transposed, *contiguous, *valuesTransposed; + THLongStorage *position; + THCudaLongStorage *sortBuffer; + THCudaLongTensor *indicesTransposed; + int64_t ndim, sliceSize, slices; + + + THAssert(THCTensor_(checkGPU)(state, 1, values)); + + // Verify they are asking for a valid dimension + ndim = THCTensor_(_nDimension)(state, input); + THArgCheck(dimension >= 0 && dimension < ndim, 4, "Dimension of out bounds"); + + sliceSize = THCTensor_(size)(state, input, dimension); + slices = THCTensor_(nElement)(state, input) / sliceSize; + + // Resize output value, index Tensors to appropriate sizes (i.e. the same as + // the input Tensor, except at dim=dimension, the size is 1) + THCTensor_preserveReduceDimSemantics( + state, values, ndim, dimension, keepdim); + THCTensor_preserveReduceDimSemantics( + state, indices, ndim, dimension, keepdim); + dim = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(dim, dimension, 1); + THCTensor_(resize)(state, values, dim, NULL); + THCudaLongTensor_resize(state, indices, dim, NULL); + THLongStorage_free(dim); + + // If sliceSize is 1, copy input to values and set indices + if (sliceSize == 1) { + THCTensor_(copy)(state, values, input); + THCudaLongTensor_fill(state, indices, TH_INDEX_BASE); + if (!keepdim) { + THCTensor_(squeeze1d)(state, values, values, dimension); + THCudaLongTensor_squeeze1d(state, indices, indices, dimension); + } + return; + } + + // Requirements for fused kernel implementation: + // + // 1. sliceSize <= 2 * max threads per block + // 2. uses one block per slice, so number of slices must be less than the maximum number of blocks for + // a kernel launch + // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed) + if (sliceSize <= MAX_BLOCK_SIZE && + slices <= MAX_GRID_SIZE && + THCTensor_canUse32BitIndexMath(state, input)) { + // Beginning our optimized implementation. 
First thing we want to do is to transpose + // the input Tensor along the sort dimension, and then make it contiguous + transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1); + contiguous = THCTensor_(newContiguous)(state, transposed); + + // We also need to view the values and indices Tensors as transposed in order to + // properly determine the offset into the underlying storage in which to place the + // mode and index for a particular set of dimension values + valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim-1); + indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim-1); + + // Set-up TensorInfo structs for passing to kernel + TensorInfo tiValues = getTensorInfo(state, valuesTransposed); + TensorInfo tiIndices = getTensorInfo(state, indicesTransposed); + + // The number of blocks is the number of slices that we need to calculate the mode for. Each block + // is responsible for computing a single mode + dim3 grid; + THC_getGridFromTiles(slices, grid); + + // The blocksize is two elements per thread, rounded up to the nearest power of 2 + int64_t ceilPowerOf2 = nextHighestPowerOf2(sliceSize); + + // Macro that calls kernel --> note that we set the block dimensions here, and + // the amount of shared memory + #define HANDLE_MODE(SIZE) \ + { \ + dim3 blockSize(SIZE / 2); \ +\ + int memsize = (sizeof(real) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \ + computeMode \ + <<>>( \ + THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \ + } + + // Tradeoff between compilation time and the number of specializations. Ideally we would have + // one HANDLE_MODE for each power of 2 + switch(ceilPowerOf2) { + case 2048: + HANDLE_MODE(2048) + break; + case 1024: + case 512: + case 256: + HANDLE_MODE(1024) + break; + case 128: + case 64: + HANDLE_MODE(128) + break; + case 32: + case 16: + case 8: + case 4: + case 2: + HANDLE_MODE(32) + break; + case 1: + default: + assert(false); + } + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, transposed); + THCTensor_(free)(state, contiguous); + THCTensor_(free)(state, valuesTransposed); + THCudaLongTensor_free(state, indicesTransposed); + } else { + // Beginning our naive implementation: We don't want to mutate the input Tensor, but + // we need to be able to sort the inputs along the dimension in order to calculate the + // mode. Additionally, its ideal if the data along the dimension is contiguous. So + // we transpose the dimension with the innermost dimension and make a new contiguous + // version that we can use. 
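+  // The slow path reuses the Thrust-based calculateMode above: dimApplyMode
+  // visits every combination of outer indices and, for each innermost slice,
+  // sorts the slice, counts equal-value runs with reduce_by_key, takes the
+  // key with the largest count, and records the first position at which that
+  // value appears in the original (unsorted) order via the sorted index buffer.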
+ transposed = THCTensor_(newClone)(state, input); + THCTensor_(transpose)(state, transposed, NULL, dimension, ndim - 1); + contiguous = THCTensor_(newContiguous)(state, transposed); + THCTensor_(free)(state, transposed); + + // We also need to view the values and indices Tensors as transposed in order to + // properly determine the offset into the underlying storage in which to place the + // mode and index for a particular set of dimension values + valuesTransposed = THCTensor_(newTranspose)(state, values, dimension, ndim - 1); + indicesTransposed = THCudaLongTensor_newTranspose(state, indices, dimension, ndim - 1); + + // Position is a Storage that will store the dimension values we are processing + position = THLongStorage_newWithSize(ndim - 1); + + // Sort Buffer is a Storage that will be used in the internal sort required to calculate + // the mode efficiently + sortBuffer = THCudaLongStorage_newWithSize(state, sliceSize); + + // Call mode + THCTensor_(dimApplyMode)(state, valuesTransposed, indicesTransposed, contiguous, sortBuffer, dimension, position, 0); + + THCTensor_(free)(state, contiguous); + THLongStorage_free(position); + THCTensor_(free)(state, valuesTransposed); + THCudaLongTensor_free(state, indicesTransposed); + THCudaLongStorage_free(state, sortBuffer); + } + + if (!keepdim) { + THCTensor_(squeeze1d)(state, values, values, dimension); + THCudaLongTensor_squeeze1d(state, indices, indices, dimension); + } +} + +#undef MAX_GRID_SIZE +#undef MAX_BLOCK_SIZE + +#endif diff --git a/aten/src/THC/generic/THCTensorMode.h b/aten/src/THC/generic/THCTensorMode.h new file mode 100644 index 0000000..6f24380 --- /dev/null +++ b/aten/src/THC/generic/THCTensorMode.h @@ -0,0 +1,14 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMode.h" +#else + +/* Returns the mode, and index of the mode, for the set of values + * along a given dimension in the input tensor. 
*/ +THC_API void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim); + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu new file mode 100644 index 0000000..353fadc --- /dev/null +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -0,0 +1,545 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorRandom.cu" +#else + +#define NUM_BLOCKS min((int)THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS) + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_uniform<<>>( + gen->state.gen_states, size, data, a, b); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_normal<<>>( + gen->state.gen_states, size, data, mean, stdv); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { + THCTensor_(resizeAs)(state, self, means); + THCTensor_(normal)(state, self, 0, stddev); + THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); +} + +THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +{ + THCTensor_(resizeAs)(state, self, stddevs); + THCTensor_(normal)(state, self, 0, 1); + THCTensor_(cmul)(state, self, self, stddevs); + THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); +} + +THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +{ + THCTensor_(resizeAs)(state, self, means); + THCTensor_(normal)(state, self, 0, 1); + THCTensor_(cmul)(state, self, self, stddevs); + THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); +} + +THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +{ + + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generateLogNormal<<>>( + gen->state.gen_states, size, data, mean, stdv); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + 
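+  // generate_exponential (defined alongside the other GENERATE_KERNEL
+  // helpers) presumably uses inverse-transform sampling: for U drawn
+  // uniformly from (0, 1],  -log(U) / lambda  follows Exponential(lambda).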
+ generate_exponential<<>>( + gen->state.gen_states, size, data, lambda); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_cauchy<<>>( + gen->state.gen_states, size, data, median, sigma); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +void THCTensor_(renormRows)(struct THCState* state, + THCTensor* t) { + THAssert(THCTensor_(_nDimension)(state, t) == 2); + int64_t rows = THCTensor_(size)(state, t, 0); + int64_t cols = THCTensor_(size)(state, t, 1); + + cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state); + THAssert(props != NULL); + + int numSM = props->multiProcessorCount; + int maxThreads = props->maxThreadsPerBlock; + + dim3 grid(rows < numSM * 4 ? rows : numSM * 4); + dim3 block(cols < maxThreads ? cols : maxThreads); + + renormRowsL1 + <<>>(THCTensor_(data)(state, t), + rows, cols); +} + +THC_API void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); + THCGenerator* gen = THCRandom_getGenerator(state); + + int inputSize = THCTensor_(_nDimension)(state, prob_dist); + THArgCheck(inputSize > 0 && inputSize <= 2, 2, + "prob_dist must be 1 or 2 dim"); + + // Categories are in the innermost dimension + int64_t numDist = + inputSize == 1 ? 1 : THCTensor_(size)(state, prob_dist, 0); + int64_t numCategoriesLong = + inputSize == 1 ? THCTensor_(size)(state, prob_dist, 0) : + THCTensor_(size)(state, prob_dist, 1); + + // Since the index tensor is float, numCategories cannot exceed max + // float integer precision + THArgCheck(numCategoriesLong <= FLOAT32_MAX_CONSECUTIVE_INT, 2, + "number of categories cannot exceed 2^24"); + int numCategories = (int) numCategoriesLong; + + THArgCheck(n_sample > 0, 3, "cannot sample <= 0 samples"); + + if (!with_replacement) { + THArgCheck(n_sample <= numCategories, 2, + "cannot sample n_sample > prob_dist:size(1) samples without " + "replacement"); + } + + int free_prob_dist = 0; + + // Restructure data for 2d + if (inputSize == 1) { + THCTensor *temp = THCTensor_(new)(state); + THCTensor_(unsqueeze1d)(state, temp, prob_dist, 0); + prob_dist = temp; + free_prob_dist = 1; + } + + THCudaLongTensor_resize2d(state, self, numDist, n_sample); + + // get current device properties + cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state); + THAssert(props != NULL); + int numSM = props->multiProcessorCount; + int maxThreads = props->maxThreadsPerBlock; + int maxShared = props->sharedMemPerBlock; + int requiredShared = (numCategories < maxThreads ? numCategories : maxThreads) + * (sizeof(real) * sizeof(accreal)); + + if (n_sample == 1 && maxShared >= requiredShared) { + // Optimized allocation-free implementation + // To exploit greater parallelism for the sampling, generate the + // Uniform random samples in a separate kernel launch, into + // temporarily allocated memory. The device RNG is thread-limited + THCTensor *sampled = THCTensor_(newWithSize2d)(state, numDist, n_sample); + THCTensor_(uniform)(state, sampled, 0.0, 1.0); + + dim3 block(numCategories < maxThreads ? 
numCategories : maxThreads); + dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4); + + sampleMultinomialOnce + <<>>( + THCudaLongTensor_data(state, self), + numDist, + numCategories, + THCTensor_(data)(state, sampled), + THCTensor_(data)(state, prob_dist), + THCTensor_(stride)(state, prob_dist, 0), + THCTensor_(stride)(state, prob_dist, 1) + ); + THCTensor_(free)(state, sampled); + } else { + // Generic, slow implementation with memory allocations + + // For sampling without replacement, we modify the distribution + // for subsequent samples in this space + THCTensor* origDist = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, origDist, prob_dist); + THCTensor_(copy)(state, origDist, prob_dist); + + THCTensor* normDist = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, normDist, prob_dist); + + THCTensor* prefixSum = THCTensor_(new)(state); + + // Renorm along rows + THCTensor_(copy)(state, normDist, origDist); + THCTensor_(renormRows)(state, normDist); + + // Prefix sum along rows + THCTensor_(cumsum)(state, prefixSum, normDist, 1); + + if (with_replacement) { + // Sample with replacement + + // Binary search is warp divergent (so effectively we're running + // with just a single thread), but for better utilization, + // we need each block to have at least 4 warps. + dim3 block(32, 4); + + // Each warp in a block will generate a sample from one + // distribution concurrently. + dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS); + + sampleMultinomialWithReplacement + <<>>( + gen->state.gen_states, + n_sample, + THCudaLongTensor_data(state, self), + numDist, numCategories, + THCTensor_(data)(state, prefixSum)); + } else { + // Sample without replacement + + // Binary search is warp divergent (so effectively we're running + // with just a single thread), but for better utilization, + // we need each block to have at least 4 warps. + dim3 block(32, 4); + + // Each warp in a block will generate a sample from a different + // distribution concurrently. + ptrdiff_t numBlocks = THCCeilDiv(numDist, (int64_t) 4); + dim3 grid(numBlocks < MAX_NUM_BLOCKS ? 
numBlocks : MAX_NUM_BLOCKS); + + for (int sample = 0; sample < n_sample; ++sample) { + if (sample > 0) { + // Update probabilities + // Renorm along rows + THCTensor_(copy)(state, normDist, origDist); + THCTensor_(renormRows)(state, normDist); + + // Prefix sum along rows + THCTensor_(cumsum)(state, prefixSum, normDist, 1); + } + + // The kernel can only draw one sample before we have to + // recalculate our distribution + sampleMultinomialWithoutReplacement + <<>>( + gen->state.gen_states, + n_sample, + sample, + THCudaLongTensor_data(state, self), + numDist, numCategories, + THCTensor_(data)(state, origDist), + THCTensor_(data)(state, prefixSum)); + } + } + + THCTensor_(free)(state, prefixSum); + THCTensor_(free)(state, normDist); + THCTensor_(free)(state, origDist); + } + + // Revert data restructuring based on input sizes + if (inputSize == 1) { + THCudaLongTensor_resize1d(state, self, n_sample); + } + if (free_prob_dist) { + THCTensor_(free)(state, prob_dist); + } +} + +THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ + THAssert(THCTensor_(isContiguous)(state, _q)); + THAssert(THCudaLongTensor_isContiguous(state, _J)); + THAssert(THCTensor_(isContiguous)(state, _probs)); + int64_t inputsize = THCTensor_(nElement)(state, _probs); + THCudaLongTensor *smaller = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *larger = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *smaller_short = THCudaLongTensor_newWithSize1d(state, inputsize); + THCudaLongTensor *larger_short = THCudaLongTensor_newWithSize1d(state, inputsize); + + THCudaLongTensor_resize1d(state, _J, inputsize); + THCTensor_(resize1d)(state, _q, inputsize); + + real one = ScalarConvert::to(1); + int inputBlockDim = THCCeilDiv((int)inputsize + BLOCK_SIZE - 1, BLOCK_SIZE); + aliasMultinomialFilter + <<>>( + THCTensor_(data)(state, _q), + THCTensor_(data)(state, _probs), + THCudaLongTensor_data(state, smaller), + THCudaLongTensor_data(state, larger), + THCudaLongTensor_data(state, _J), + THCudaLongTensor_data(state, smaller_short), + THCudaLongTensor_data(state, larger_short), + one, inputsize + ); + + THCudaLongTensor_nonzero(state, smaller_short, smaller); + THCudaLongTensor_nonzero(state, larger_short, larger); + int h_large_c = THCudaLongTensor_nElement(state, larger_short); + THCudaLongTensor_resize1d(state, smaller_short, inputsize); + THCudaLongTensor_resize1d(state, larger_short, inputsize); + aliasMultinomialSetup + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + THCudaLongTensor_data(state, _J), + THCTensor_(data)(state, _q), + inputsize, + THCudaLongTensor_data(state, smaller_short), + THCudaLongTensor_data(state, larger_short), + inputsize - h_large_c, h_large_c + ); + real q_max = THCTensor_(maxall)(state, _q); + condDiv<<< + inputBlockDim, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, _q), + THCudaLongTensor_data(state, _J), + inputsize, q_max + ); + + THCudaLongTensor_free(state, smaller); + THCudaLongTensor_free(state, larger); + THCudaLongTensor_free(state, smaller_short); + THCudaLongTensor_free(state, larger_short); +} + +THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ + THAssert(THCTensor_(isContiguous)(state, _q)); + THAssert(THCudaLongTensor_isContiguous(state, _J)); + THCGenerator* gen = THCRandom_getGenerator(state); + int64_t K = THCudaLongTensor_nElement(state, _J); + int64_t output_nelem 
= THCudaLongTensor_nElement(state, self); + ptrdiff_t size = THCudaLongTensor_nElement(state, self); + + THCTensor *uniform = THCTensor_(newWithSize1d)(state, output_nelem); + THCTensor *bernoulli = THCTensor_(newWithSize1d)(state, output_nelem); + + THCTensor_(uniform)(state, uniform, 0, K); + THCTensor_(uniform)(state, bernoulli, 0, 1); + + multinomialAliasDrawKernel + <<>>( + size, + THCudaLongTensor_data(state, self), + THCudaLongTensor_data(state, _J), + THCTensor_(data)(state, _q), + K, + THCTensor_(data)(state, uniform), + THCTensor_(data)(state, bernoulli) + ); +} + +#endif + +#if defined(THC_REAL_IS_DOUBLE) +GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p) +#else +GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) +#endif + +THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_bernoulli<<>>( + gen->state.gen_states, size, data, p); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p) +{ +#if defined(THC_REAL_IS_FLOAT) + THCTensor_(bernoulli_FloatTensor)(state, self, p); +#elif defined(THC_REAL_IS_DOUBLE) + THCTensor_(bernoulli_DoubleTensor)(state, self, p); +#endif +} + +#define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ +THC_API void THCTensor_(NAME)(THCState* state, \ + THCTensor *self_, PROB_TYPE *probs_) \ +{ \ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ + ptrdiff_t size = THCTensor_(nElement)(state, self_); \ + if (size == 0) return; \ + THCGenerator* gen = THCRandom_getGenerator(state); \ + THCTensor *self = THCTensor_(newContiguous)(state, self_); \ + PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_); \ + ptrdiff_t prob_size = PROB_TYPE##_nElement(state, probs); \ + real *result_data = THCTensor_(data)(state, self); \ + PROB_DATA_TYPE *probs_data = PROB_TYPE##_data(state, probs); \ + \ + THArgCheck(size == prob_size, 3, "inconsistent tensor size"); \ + \ + generate_bernoulli_tensor<<>>( \ + gen->state.gen_states, size, result_data, probs_data); \ + \ + PROB_TYPE##_free(state, probs); \ + THCTensor_(freeCopyTo)(state, self, self_); \ +} + +DEFINE_BERNOULLI_TENSOR(bernoulli_FloatTensor, THCudaTensor, float) +DEFINE_BERNOULLI_TENSOR(bernoulli_DoubleTensor, THCudaDoubleTensor, double) + +#if defined(THC_REAL_IS_DOUBLE) +GENERATE_KERNEL1(generate_geometric, double, double p, double, curand_uniform_double, ceil(log(x) / log(1-p))) +#else +GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (ScalarConvert::to(ceilf(logf(x) / log(1-p))))) +#endif + +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) +#define CURAND64(STATE) (((uint64_t)curand(STATE)) << 32) | (uint64_t)curand(STATE) +GENERATE_KERNEL2(generate_random, real, int32_t base, uint32_t range, uint32_t, curand, \ + static_cast(static_cast((x % range) + base))) +GENERATE_KERNEL2(generate_random_64, real, int64_t base, uint64_t range, uint64_t, CURAND64, \ + static_cast(static_cast((x % range) + base))) +#elif defined(THC_REAL_IS_HALF) +GENERATE_KERNEL2(generate_random, real, int32_t base, 
uint32_t range, uint32_t, curand, + (ScalarConvert::to(static_cast(x % range + base)))) +#else +GENERATE_KERNEL2(generate_random, real, int32_t base, uint32_t range, uint32_t, curand, + static_cast(static_cast(x % range + base))) +#endif + +THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + generate_geometric<<>>( + gen->state.gen_states, size, data, p); + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +{ + THArgCheck(min_val < max_val, 2, + "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + + uint64_t range = max_val - min_val; + +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) + if (range > 1ULL << 32) { + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, min_val, range); + } else { +#endif + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(min_val), static_cast(range)); +#if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT) + } +#endif + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +{ + THCTensor_(clampedRandom)(state, self_, 0LL, max_val); +}; + +#define HLF_MANT_DIG 11 + +THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); + ptrdiff_t size = THCTensor_(nElement)(state, self_); + if (size == 0) return; + THCGenerator* gen = THCRandom_getGenerator(state); + THCTensor *self = THCTensor_(newContiguous)(state, self_); + real *data = THCTensor_(data)(state, self); + +#if defined(THC_REAL_IS_HALF) + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast((1UL << HLF_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_FLOAT) + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast((1UL << FLT_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_DOUBLE) + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0ULL), static_cast((1ULL << DBL_MANT_DIG) + 1)); +#elif defined(THC_REAL_IS_LONG) + generate_random_64<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0ULL), static_cast(std::numeric_limits::max()) + 1); +#else + generate_random<<>>( + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast(std::numeric_limits::max()) + 1); +#endif + + THCTensor_(freeCopyTo)(state, self, self_); +}; + +#undef HLF_MANT_DIG +#undef CURAND64 +#undef NUM_BLOCKS + +#endif diff --git a/aten/src/THC/generic/THCTensorRandom.h b/aten/src/THC/generic/THCTensorRandom.h new file mode 100644 index 0000000..1deb2db --- /dev/null +++ b/aten/src/THC/generic/THCTensorRandom.h @@ -0,0 +1,30 @@ +#ifndef 
THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorRandom.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + +THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b); +THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv); +THC_API void THCTensor_(normal_means)(struct THCState *state, THCTensor *self, THCTensor *means, double stddev); +THC_API void THCTensor_(normal_stddevs)(struct THCState *state, THCTensor *self, double mean, THCTensor *stddevs); +THC_API void THCTensor_(normal_means_stddevs)(struct THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs); +THC_API void THCTensor_(logNormal)(struct THCState *state, THCTensor *self, double mean, double stdv); +THC_API void THCTensor_(exponential)(struct THCState *state, THCTensor *self, double lambda); +THC_API void THCTensor_(cauchy)(struct THCState *state, THCTensor *self, double median, double sigma); +THC_API void THCTensor_(multinomial)(struct THCState *state, THCudaLongTensor *self, THCTensor *prob_dist, int n_sample, int with_replacement); +THC_API void THCTensor_(multinomialAliasSetup)(struct THCState *state, THCTensor *probs, THCudaLongTensor *J, THCTensor *q); +THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q); + +#endif + +THC_API void THCTensor_(random)(struct THCState *state, THCTensor *self); +THC_API void THCTensor_(clampedRandom)(struct THCState *state, THCTensor *self, int64_t min, int64_t max); +THC_API void THCTensor_(cappedRandom)(struct THCState *state, THCTensor *self, int64_t max); +THC_API void THCTensor_(bernoulli)(struct THCState *state, THCTensor *self, double p); +THC_API void THCTensor_(bernoulli_FloatTensor)(struct THCState *state, THCTensor *self, THCudaTensor *p); +THC_API void THCTensor_(bernoulli_DoubleTensor)(struct THCState *state, THCTensor *self, THCudaDoubleTensor *p); +THC_API void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p); +THC_API void THCTensor_(geometric)(struct THCState *state, THCTensor *self, double p); + +#endif diff --git a/aten/src/THC/generic/THCTensorScatterGather.cu b/aten/src/THC/generic/THCTensorScatterGather.cu new file mode 100644 index 0000000..f04ae5a --- /dev/null +++ b/aten/src/THC/generic/THCTensorScatterGather.cu @@ -0,0 +1,362 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorScatterGather.cu" +#else + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_gatherKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(gather)(THCState* state, THCTensor *tensor, + THCTensor *src, int dim, THCudaLongTensor *index) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 4, + "Index tensor must have same dimensions as input tensor"); + THLongStorage *indexSize = THCudaLongTensor_newSizeOf(state, index); + THArgCheck(THCTensor_(isSize)(state, tensor, indexSize), 4, + "Index tensor must have the same size as output tensor."); + THLongStorage_free(indexSize); + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 3, + "Index dimension is out of bounds"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 2, + "Input tensor must have 
same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 2, + "Input tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + THCudaCheck(cudaGetLastError()); + break; + case 2: + RUN(unsigned int, 2, real); + THCudaCheck(cudaGetLastError()); + break; + case 3: + RUN(unsigned int, 3, real); + THCudaCheck(cudaGetLastError()); + break; + default: + RUN(unsigned int, -1, real); + THCudaCheck(cudaGetLastError()); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + RUN(uint64_t, -1, real); + THCudaCheck(cudaGetLastError()); + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 3, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 4, + "Input tensor must have same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + int64_t indexSizeD = THCudaLongTensor_size(state, index, d); + if (d != dim) { + THArgCheck(indexSizeD <= THCTensor_(size)(state, tensor, d), 3, + "Index tensor must not have larger size than output tensor apart from the specified dimension %d, but got index %s output %s", + dim, THCudaLongTensor_sizeDesc(state, index).str, THCTensor_(sizeDesc)(state, tensor).str); + } + THArgCheck(indexSizeD <= THCTensor_(size)(state, src, d), 3, + "Index tensor must not have larger size than input tensor, but got index %s input %s", + THCudaLongTensor_sizeDesc(state, index).str, THCTensor_(sizeDesc)(state, src).str); + } + + 
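  /* [Editorial sketch] For reference, the semantics this scatter path implements,
     written as plain host code for the 2-D, dim == 1 case:
     tensor[i][index[i][j]] = src[i][j]. The helper name and the flat row-major
     layout are illustrative only, not part of the THC API.

       #include <cassert>
       #include <cstdint>
       #include <vector>

       void scatter_dim1_reference(std::vector<float>& out, int64_t out_cols,
                                   const std::vector<int64_t>& index,
                                   const std::vector<float>& src,
                                   int64_t rows, int64_t cols) {
         for (int64_t i = 0; i < rows; ++i) {
           for (int64_t j = 0; j < cols; ++j) {
             int64_t col = index[i * cols + j];
             assert(col >= 0 && col < out_cols);         // index must be in range
             out[i * out_cols + col] = src[i * cols + j]; // overwrite, last writer wins
           }
         }
       }

     Duplicate entries in `index` make plain scatter order-dependent; scatterAdd
     below accumulates instead, which is why the CUDA version of that variant
     typically relies on atomic adds. */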
THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real) + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterAddKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + +void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == THCTensor_(_nDimension)(state, src), 3, + "Index tensor must have same dimensions as input tensor"); + THArgCheck(THCTensor_(_nDimension)(state, src) == THCTensor_(_nDimension)(state, tensor), 4, + "Input tensor must have same dimensions as output tensor"); + THLongStorage *indexDims = THCudaLongTensor_newSizeOf(state, index); + THArgCheck(THCTensor_(isSize)(state, src, indexDims), 3, + "Index tensor must have the same size as input tensor."); + THLongStorage_free(indexDims); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 4, + "Input tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, 
index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. + switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo srcInfo = + getTensorInfo(state, src); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real) + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#define RUN(TYPE, DIMS, REAL) \ + THCudaTensor_scatterFillKernel \ + <<>>( \ + tensorInfo, indexInfo, value, dim, (TYPE)totalElements); + +void +THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, + int dim, THCudaLongTensor *index, real value) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); + + THArgCheck(dim >= 0 && dim < THCTensor_(_nDimension)(state, tensor), 2, + "Index dimension is out of bounds"); + THArgCheck(THCudaLongTensor__nDimension(state, index) == + THCTensor_(_nDimension)(state, tensor), 3, + "Index tensor must have same dimensions as output tensor"); + + for (int d = 0; d < THCTensor_(_nDimension)(state, tensor); d++) { + if (d != dim) { + THArgCheck(THCTensor_(size)(state, tensor, d) == + THCudaLongTensor_size(state, index, d), 4, + "Index tensor must have same size as output tensor apart from the specified dimension"); + } + } + + THArgCheck(THCTensor_(_nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, + 1, CUTORCH_DIM_WARNING); + + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); + + THCTensor* oldTensor = NULL; + if (THCTensor_maybeOverlappingIndices(state, tensor)) { + oldTensor = tensor; + tensor = THCTensor_(newContiguous)(state, tensor); + } + + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, index)) { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo indexInfo = + getTensorInfo(state, index); + + // Specialize for a small number of dimensions. 
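  /* [Editorial sketch] Why these dispatches specialize on indexInfo.dims: when the
     dimensionality is a compile-time constant, the linear-index-to-offset math can
     be unrolled, while -1 selects a runtime-loop catch-all. A simplified,
     hypothetical version of that helper (the real one is THC's IndexToOffset over
     TensorInfo) looks like:

       template <typename IndexType>
       struct SimpleTensorInfo {
         IndexType sizes[MAX_CUTORCH_DIMS];
         IndexType strides[MAX_CUTORCH_DIMS];
         int dims;
       };

       // Dims > 0: the loop bound is a constant the compiler can unroll.
       // Dims == -1: fall back to the runtime bound stored in info.dims.
       template <typename IndexType, int Dims>
       __host__ __device__ IndexType indexToOffset(
           IndexType linear, const SimpleTensorInfo<IndexType>& info) {
         IndexType offset = 0;
         const int bound = (Dims == -1) ? info.dims : Dims;
         for (int d = bound - 1; d >= 0; --d) {
           offset += (linear % info.sizes[d]) * info.strides[d];
           linear /= info.sizes[d];
         }
         return offset;
       }
  */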
+ switch (indexInfo.dims) { + case 1: + RUN(unsigned int, 1, real); + break; + case 2: + RUN(unsigned int, 2, real); + break; + case 3: + RUN(unsigned int, 3, real); + break; + default: + RUN(unsigned int, -1, real); + break; + } + } else { + TensorInfo tensorInfo = + getTensorInfo(state, tensor); + TensorInfo indexInfo = + getTensorInfo(state, index); + + RUN(uint64_t, -1, real); + } + + if (oldTensor) { + THCTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + THCTensor_(free)(state, tensor); + tensor = oldTensor; + } + THCudaCheck(cudaGetLastError()); +} + +#undef RUN + +#endif diff --git a/aten/src/THC/generic/THCTensorScatterGather.h b/aten/src/THC/generic/THCTensorScatterGather.h new file mode 100644 index 0000000..e7e83b2 --- /dev/null +++ b/aten/src/THC/generic/THCTensorScatterGather.h @@ -0,0 +1,10 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorScatterGather.h" +#else + +THC_API void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index); +THC_API void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src); +THC_API void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src); +THC_API void THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, real value); + +#endif diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu new file mode 100644 index 0000000..a97d19b --- /dev/null +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -0,0 +1,336 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorSort.cu" +#else + +// In alignment with default sort on a c++ map, this function +// will permute key and value tensors identically, and +// in such a way that the 'key' tensor is ordered numerically +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { + THLongStorage *valueSize = THCudaLongTensor_newSizeOf(state, value); + THArgCheck(THCTensor_(isSize)(state, key, valueSize), 2, + "Key tensor must have same size as value tensor"); + THLongStorage_free(valueSize); + int dims = THCudaLongTensor__nDimension(state, value); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, key); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + + ptrdiff_t inElements = THCTensor_(nElement)(state, key); + int64_t keySliceSize = THCTensor_(size)(state, key, dim); + ptrdiff_t keySlices = inElements / keySliceSize; + + if (THCTensor_(_nDimension)(state, key) == 0) { + // Zero-dim tensor; do nothing + return; + } + + // The amount of shared memory and block size is based on + // 2^ceil(lg(n)); we choose that sorting implementation for a given + // size. 
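  /* [Editorial sketch] nextHighestPowerOf2() rounds the slice size up to a power
     of two so that one of the fixed-size bitonic kernels below can be chosen.
     Assuming the usual bit-twiddling formulation for 64-bit values, a standalone
     equivalent would be:

       #include <cstdint>

       uint64_t nextPow2(uint64_t n) {
         if (n == 0) return 1;
         --n;
         n |= n >> 1;  n |= n >> 2;  n |= n >> 4;
         n |= n >> 8;  n |= n >> 16; n |= n >> 32;
         return n + 1;
       }

     For example, nextPow2(1000) == 1024, which routes a 1000-element slice to the
     1024-wide HANDLE_CASE instantiation. */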
+ int64_t ceilPowerOf2 = nextHighestPowerOf2(keySliceSize); + + // FIXME: We'd have to find some other trick with Thrust to perform a + // vectorized (key, value) sort by slice segment + if (ceilPowerOf2 > 2048) { + THError("sortKeyValueInplace only works for sizes <= 2048 at present"); + } + + // The grid is based on the number of independent slices that we + // have to sort; one block per slice + dim3 grid; + if (!THC_getGridFromTiles(keySlices, grid)) { + THError("Slice to sort is too large"); + } + +#define HANDLE_CASE(TYPE, A, SIZE) \ + do { \ + int blockSize = SIZE / 2; \ + if (blockSize < 1) { \ + blockSize = 1; \ + } \ + \ + dim3 block(blockSize); \ + \ + if (dir) { \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ + keyInfo, \ + keySlices, \ + (TYPE) keySliceSize, \ + (TYPE) keyInfo.strides[collapseKeyDim], \ + valueInfo, \ + (TYPE) valueInfo.strides[collapseValueDim], \ + GTComp()); \ + } else { \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ + keyInfo, \ + keySlices, \ + (TYPE) keySliceSize, \ + (TYPE) keyInfo.strides[collapseKeyDim], \ + valueInfo, \ + (TYPE) valueInfo.strides[collapseValueDim], \ + LTComp()); \ + } \ + } while (0) + +#define HANDLE_SORT_CASE(TYPE, A) \ + { \ + switch (ceilPowerOf2) { \ + case 2048: \ + HANDLE_CASE(TYPE, A, 2048); \ + break; \ + case 1024: \ + case 512: \ + case 256: \ + HANDLE_CASE(TYPE, A, 1024); \ + break; \ + case 128: \ + case 64: \ + HANDLE_CASE(TYPE, A, 128); \ + break; \ + case 32: \ + case 16: \ + case 8: \ + case 4: \ + case 2: \ + HANDLE_CASE(TYPE, A, 32); \ + break; \ + case 1: \ + /* Nothing to do, data already sorted */ \ + break; \ + default: \ + assert(false); \ + } \ + } + + // The constructed key/value tensor info is used to select the slice + // we are sorting on a per-block basis + if (THCTensor_canUse32BitIndexMath(state, key)) { + TensorInfo keyInfo = + getTensorInfo(state, key); + keyInfo.reduceDim(dim); + int collapseKeyDim = keyInfo.collapseDims(dim); + + TensorInfo valueInfo = + getTensorInfo(state, value); + valueInfo.reduceDim(dim); + int collapseValueDim = valueInfo.collapseDims(dim); + + if (keyInfo.isContiguous()) { + HANDLE_SORT_CASE(unsigned int, -2); + } else { + switch (keyInfo.dims) { + case 2: + HANDLE_SORT_CASE(unsigned int, 2); + break; + default: + HANDLE_SORT_CASE(unsigned int, -1); + break; + } + } + } else { + TensorInfo keyInfo = + getTensorInfo(state, key); + keyInfo.reduceDim(dim); + int collapseKeyDim = keyInfo.collapseDims(dim); + + TensorInfo valueInfo = + getTensorInfo(state, value); + valueInfo.reduceDim(dim); + int collapseValueDim = valueInfo.collapseDims(dim); + + // int64_t case is rare, just instantiate the generic version + HANDLE_SORT_CASE(uint64_t, -1); + } +#undef HANDLE_CASE +#undef HANDLE_SORT_CASE +#undef HANDLE_A_CASE + + THCudaCheck(cudaGetLastError()); +} + +void THCTensor_(sortViaThrust)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, bool dir) { + int nDims = THCTensor_(_nDimension)(state, input); + + ptrdiff_t totalElements = THCTensor_(nElement)(state, input); + int64_t sliceSize = THCTensor_(size)(state, input, dim); + int64_t sliceStride = THCTensor_(stride)(state, input, dim); + + // We perform a vectorized segmented sort in Thrust. + // Say we are sorting a (2, 3) tensor. 
We have in flattened form: + // values 0.4 1.2 5.3 6.2 1.3 2.3 + // indices 0 1 2 3 4 5 + // where indices is a global index (across all slices) + + // First we sort by values, globally: + // values 6.2 5.3 2.3 1.2 1.3 0.4 + // indices 3 2 5 1 4 0 + + // Then we stable sort by segment, which is index / 3: + // values 5.3 1.2 0.4 6.2 2.3 1.3 + // indices 2 1 0 3 5 4 + + // Then we translate the global index to a per-slice Lua index + // (index % 3) + 1: + // values 5.3 1.2 0.4 6.2 2.3 1.3 + // indices 3 2 1 1 3 2 + + // This method can only work if the slice we are sorting (`dim`) is + // innermost, and both values and indices are contiguous. We do this + // by re-arranging the input into this form as needed, which will + // unfortunately allocate memory if the request is not in this form. + // Vectorized sort is slower than iterated sort if the number of + // slices is small (since we're sorting twice, instead of invoking a + // smaller sort `numSlices` times), but the Thrust sort + // implementation here is a catch-all, so we're not looking for + // efficiency, but instead correctness. + THCTensor_(copy)(state, sorted, input); + THCTensor* trKeys = THCTensor_(newWithTensor)(state, sorted); + THCudaLongTensor* trIndices = THCudaLongTensor_newWithTensor(state, indices); + + // Transpose dim to innermost + if (dim != nDims - 1) { + THCTensor_(transpose)(state, trKeys, NULL, dim, nDims - 1); + THCudaLongTensor_transpose(state, trIndices, NULL, dim, nDims - 1); + } + + // Thrust must operate on a contiguous layout + THCTensor* trContigKey = THCTensor_(newContiguous)(state, trKeys); + THCudaLongTensor* trContigIndices = THCudaLongTensor_newContiguous(state, trIndices); + + THCTensor_(free)(state, trKeys); + THCudaLongTensor_free(state, trIndices); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr keyIter(THCTensor_(data)(state, trContigKey)); + + // Since we are composing a global index across all segments rather + // than a per-segment index, we treat the memory as int so we don't + // have problems sorting slices < 2^24 but where the entire tensor + // has more than 2^24 elements + thrust::device_ptr + indexIter((int64_t*) THCudaLongTensor_data(state, trContigIndices)); + + // Fill the indices with a global index across all slices + thrust::counting_iterator countIter(0); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + totalElements, indexIter); + + // First, we sort globally (across all slices) according to key + // (the values we're sorting) + if (dir) { + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + keyIter, keyIter + totalElements, indexIter, ThrustGTOp()); + } else { + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + keyIter, keyIter + totalElements, indexIter, ThrustLTOp()); + } + + // Then, re-sort according to slice that each index is + // in. 
This completes the segment sort in Thrust, since we're + // stably sorting here, preserving the relative order of values + // per each slice + thrust::stable_sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + indexIter, indexIter + totalElements, keyIter, + SliceComp(sliceSize)); + + // Translate the global integer 0-based index to a per-slice real + // Lua index + thrust::for_each( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + indexIter, indexIter + totalElements, + GlobalIndexToPerSliceIndex(sliceSize)); + + // Reverse the transposition as needed + if (dim != nDims - 1) { + THCTensor_(transpose)(state, trContigKey, NULL, dim, nDims - 1); + THCudaLongTensor_transpose(state, trContigIndices, NULL, dim, nDims - 1); + } + + // Then copy back to the expected output + THCTensor_(freeCopyTo)(state, trContigKey, sorted); + THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); +} + +THC_API void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); + THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); + int64_t dims = THCTensor_(_nDimension)(state, sorted); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + dims = THCTensor_(_nDimension)(state, input); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + + // Make sure sufficient output space is allocated + THCTensor_(resizeAs)(state, sorted, input); + THLongStorage *inputSize = THCTensor_(newSizeOf)(state, input); + THCudaLongTensor_resize(state, indices, inputSize, NULL); + THLongStorage_free(inputSize); + + // How large are the slices that we are sorting? + int64_t sliceSize = THCTensor_(size)(state, input, dim); + + // Workaround: + // CUDA 8 uses more shared memory than 7.5 for bitonicSortKVInPlace, + // and so for the double word types, + // we get "too many resources requested for launch" in the 2048 case +#if CUDA_VERSION >= 8000 +#if defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_LONG) + int maxSliceSize = 1024; +#else + int maxSliceSize = 2048; +#endif +#else + int maxSliceSize = 2048; +#endif + + if (sliceSize <= maxSliceSize) { + // Fill `indices` (the values) with the + // slice-relative index. + THCudaLongTensor_fillSliceWithIndex(state, indices, dim); + + // We sort k/v pairs in-place; copy unsorted input to output + THCTensor_(copy)(state, sorted, input); + + // Sort using our in-place k/v kernel that supports arbitrary + // layout + THCTensor_(sortKeyValueInplace)(state, sorted, indices, dim, order); + } else { + // Otherwise, fall back upon Thrust, which handles all other cases + // (potentially slowly, with extra copies/memory allocations) + THCTensor_(sortViaThrust)(state, sorted, indices, input, dim, (bool) order); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h new file mode 100644 index 0000000..009d825 --- /dev/null +++ b/aten/src/THC/generic/THCTensorSort.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorSort.h" +#else + +/* Performs an in-place sort of (keys, values). 
Only works for slice sizes + <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, int order); + +/* Performs an out-of-place sort of `input`, returning the per-slice indices + in `indices` and the sorted values in `sorted` */ +THC_API void THCTensor_(sort)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, int order); + +#endif diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu new file mode 100644 index 0000000..c2f3a28 --- /dev/null +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -0,0 +1,165 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorTopK.cu" +#else + +THC_API void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { + THAssert(topK != NULL && indices != NULL && input_ != NULL); + THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); + THArgCheck(THCTensor_(_nDimension)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + int64_t dims = THCudaLongTensor__nDimension(state, indices); + THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING); + int numDims = THCTensor_(_nDimension)(state, input_); + THArgCheck(numDims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + + THArgCheck(dim >= 0 && dim < numDims, 6, "dim not in range"); + + int64_t sliceSize = THCTensor_(size)(state, input_, dim); + THArgCheck(k > 0 && k <= sliceSize, 5, "k not in range for dimension"); + + THCTensor *input = THCTensor_(newContiguous)(state, input_); + + // Build the output size, which is the dim being selected set to + // size k + THLongStorage* topKSize = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(topKSize, dim, k); + THCTensor_(resize)(state, topK, topKSize, NULL); + THCudaLongTensor_resize(state, indices, topKSize, NULL); + THLongStorage_free(topKSize); + +#define RUN_K(INDEX_T, DIM, DIR) \ + gatherTopK \ + <<>>( \ + inputInfo, \ + sliceSize, \ + k, \ + inputSlices, \ + /* The actual dimension that the k-selection is running in */ \ + /* may have changed from collapseDims() */ \ + inputInfo.strides[collapseInputDim], \ + topKInfo, \ + topKSlices, \ + topKInfo.strides[collapseTopKDim], \ + indicesInfo, \ + indicesInfo.strides[collapseIndicesDim]) + +#define RUN_DIR(INDEX_T, DIM) \ + if (dir) { \ + RUN_K(INDEX_T, DIM, true); \ + } else { \ + RUN_K(INDEX_T, DIM, false); \ + } + +#define RUN_DIM(INDEX_T) \ + if (allDims == 1) { \ + RUN_DIR(INDEX_T, 1); \ + } else if (allDims == 2) { \ + RUN_DIR(INDEX_T, 2); \ + } else if (allDims == 3) { \ + RUN_DIR(INDEX_T, 3); \ + } else { \ + RUN_DIR(INDEX_T, -1); \ + } + +#define RUN_T(INDEX_T) \ + TensorInfo inputInfo = \ + getTensorInfo(state, input); \ + TensorInfo topKInfo = \ + getTensorInfo(state, topK); \ + TensorInfo indicesInfo = \ + getTensorInfo(state, indices); \ + \ + /* We use these structures solely to find the offset to */ \ + /* each slice we are operating on */ \ + inputInfo.sizes[dim] = 1; \ + topKInfo.sizes[dim] = 1; \ + indicesInfo.sizes[dim] = 1; \ + \ + /* Collapse all other dims */ \ + int collapseInputDim = inputInfo.collapseDims(dim); \ + int collapseTopKDim = topKInfo.collapseDims(dim); \ + int collapseIndicesDim = indicesInfo.collapseDims(dim); \ + \ + int64_t inputSlices = 1; \ + for (int i = 0; i < inputInfo.dims; ++i) { \ + inputSlices *= inputInfo.sizes[i]; \ 
+ } \ + int64_t topKSlices = 1; \ + for (int i = 0; i < topKInfo.dims; ++i) { \ + topKSlices *= topKInfo.sizes[i]; \ + } \ + \ + dim3 grid; \ + if (!THC_getGridFromTiles(inputSlices, grid)) { \ + THError("Slice to sort is too large"); \ + } \ + \ + dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) 32), (int64_t) 1024)); \ + \ + /* This is used as a template parameter to calculate indices. */ \ + /* We only specialize it if all collapsed dim sizes are the */ \ + /* same; otherwise, we use -1 which is the specialization */ \ + /* parameter for arbitrary dimensions */ \ + int allDims = inputInfo.dims; \ + if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ + allDims = -1; \ + } \ + \ + RUN_DIM(INDEX_T); + + // Based on required index size, run the algorithm with the + // appropriate index type + if (THCTensor_canUse32BitIndexMath(state, input) && + THCTensor_canUse32BitIndexMath(state, topK) && + THCTensor_canUse32BitIndexMath(state, indices)) { + RUN_T(uint32_t); + } else { + RUN_T(uint64_t); + } +#undef RUN_T +#undef RUN_DIM +#undef RUN_DIR +#undef RUN_K + + // Sort the results if the user wants them sorted, since our + // selection routine does not ensure sorting + if (sorted) { + // FIXME: the k/v inplace sort along slice only works for size <= + // 2048 at the moment + if (sliceSize <= 2048) { + // This avoids any memory allocations and performs all sorting + // work inplace along the slice + THCTensor_(sortKeyValueInplace)(state, topK, indices, dim, dir); + } else { + // Depend upon the backup sort that returns indices, which we + // can use in conjunction with gather to produce the original + // indices. + // This is not the most efficient implementation, especially since + // there are memory allocations performed here. If the user desires + // greater performance, they should torch.gather() the results + // themselves using the reported indices, providing previously + // allocated tensors to receive the results. 
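      /* [Editorial sketch] What the fallback below computes, as plain host code for
         a single slice: sort the k selected values, then apply the same permutation
         to the indices that were already reported (the CUDA code does this with
         THCTensor_(sort) followed by a gather). The helper name is illustrative only.

           #include <algorithm>
           #include <cstdint>
           #include <numeric>
           #include <vector>

           void sortTopKSlice(std::vector<float>& values,
                              std::vector<int64_t>& indices, bool largestFirst) {
             std::vector<size_t> perm(values.size());
             std::iota(perm.begin(), perm.end(), size_t(0));
             std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
               return largestFirst ? values[a] > values[b] : values[a] < values[b];
             });
             std::vector<float> v(values.size());
             std::vector<int64_t> idx(indices.size());
             for (size_t i = 0; i < perm.size(); ++i) {
               v[i] = values[perm[i]];     // sorted top-k values
               idx[i] = indices[perm[i]];  // original indices, gathered in sort order
             }
             values.swap(v);
             indices.swap(idx);
           }
      */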
+ THCTensor* sortedTopK = THCTensor_(new)(state); + THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state); + THCTensor_(sort)(state, sortedTopK, sortedIndices, topK, dim, dir); + + THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state); + + THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices); + THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices); + + THCTensor_(freeCopyTo)(state, sortedTopK, topK); + THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices); + THCudaLongTensor_free(state, sortedIndices); + } + } + + THCudaLongTensor_free(state, input); + + THCudaCheck(cudaGetLastError()); +} + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THC/generic/THCTensorTopK.h b/aten/src/THC/generic/THCTensorTopK.h new file mode 100644 index 0000000..95dbceb --- /dev/null +++ b/aten/src/THC/generic/THCTensorTopK.h @@ -0,0 +1,13 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorTopK.h" +#else + +/* Returns the set of all kth smallest (or largest) elements, depending */ +/* on `dir` */ +THC_API void THCTensor_(topk)(THCState* state, + THCTensor* topK, + THCudaLongTensor* indices, + THCTensor* input, + int64_t k, int dim, int dir, int sorted); + +#endif // THC_GENERIC_FILE diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu new file mode 100644 index 0000000..72b7ff3 --- /dev/null +++ b/aten/src/THCUNN/Abs.cu @@ -0,0 +1,25 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct absupdateOutput_functor +{ + __device__ void operator()(T* output, const T* input) const + { + *output = THCNumerics::abs(*input); + } +}; + +template +struct absupdateGradInput_functor +{ + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const + { + *gradInput = *input < 0 ? - *gradOutput : *gradOutput; + } +}; + +#include "generic/Abs.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu new file mode 100644 index 0000000..cb0f475 --- /dev/null +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct abs_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + Dtype z = x-y; + return ScalarConvert::to(z >= 0 ? z : -z); + } +}; + +template +struct abs_updateOutput_no_reduce_functor +{ + __host__ __device__ void operator()(const Dtype* x, const Dtype* y, Dtype *out) + { + Dtype z = *x - *y; + *out = z >= 0 ? z : -z; + } +}; + +template +struct abs_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *gradInput) + { + *gradInput = ScalarConvert::to(*x >= *y ? 1 : -1); + } +}; + +template +struct abs_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + abs_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + return ((x - y) >= 0 ? 
norm : -norm) * gradOutput; + } +}; + +#include "generic/AbsCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu new file mode 100644 index 0000000..3624588 --- /dev/null +++ b/aten/src/THCUNN/BCECriterion.cu @@ -0,0 +1,134 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#include + +template +inline __host__ __device__ T eps(); + +template <> +inline __host__ __device__ float eps() { return 1e-12f; } + +template <> +inline __host__ __device__ double eps() { return 1e-12; } + +template +inline __host__ __device__ T safe_log(T a) { + if (a == 0.) + { + return THCNumerics::log(eps()); + } + return THCNumerics::log(a); +} + +template +struct bce_functor +{ + template + __host__ __device__ + Acctype operator()(Tuple x) + { + Dtype input = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + assert(input >= 0. && input <= 1.); + return - (t * safe_log(ScalarConvert::to(input)) + + (Acctype(1) - t) * safe_log(Acctype(1) - input)); + } +}; + +template +struct bce_updateOutput_no_reduce_functor +{ + __forceinline__ __host__ __device__ + void operator()( + const Dtype *input, + const Dtype *target, + Dtype *output) + { + assert(*input >= 0. && *input <= 1.); + *output = ScalarConvert::to( + -(*target * safe_log(ScalarConvert::to(*input)) + + (Acctype(1) - *target) * safe_log(Acctype(1) - *input))); + } +}; + +template +struct bce_functor_weights +{ + template + __host__ __device__ + Acctype operator()(Tuple x) + { + Dtype input = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + assert(input >= 0. 
&& input <= 1.); + return - w * (t * safe_log(ScalarConvert::to(input)) + + (Acctype(1) - t) * safe_log(Acctype(1) - input)); + } +}; + +template +struct bce_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ + void operator()( + const Dtype *x, + const Dtype *t, + Dtype *gradInput) + { + *gradInput = ScalarConvert::to( + - (*t - *x) / ((Acctype(1) - *x + eps()) * (*x + eps()))); + } +}; + +template +struct bce_updateGradInput_functor +{ + const Dtype norm; + + bce_updateGradInput_functor(Dtype norm_) + : norm(norm_) + {} + + template + __host__ __device__ + Dtype operator()(Tuple x) + { + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm); + } +}; + +template +struct bce_updateGradInput_functor_weights +{ + const Dtype norm; + + bce_updateGradInput_functor_weights(Dtype norm_) + : norm(norm_) + {} + + template + __host__ __device__ + Dtype operator()(Tuple x) + { + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm * w); + } +}; + +#include "generic/BCECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu new file mode 100644 index 0000000..03531b3 --- /dev/null +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -0,0 +1,291 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +const int WARP_SIZE = 32; + +// The maximum number of threads in a block +const int MAX_BLOCK_SIZE = 512; + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { + int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return MAX_BLOCK_SIZE; +} + +// Returns the index of the most significant 1 bit in `val`. 
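// [Editorial sketch] getMSB() below bounds the number of butterfly steps in
// warpSum() further down: summing across a full warp takes log2(WARP_SIZE)
// xor-shuffle rounds. A minimal standalone equivalent, assuming CUDA 9+'s
// __shfl_xor_sync and a full 32-lane warp (illustrative only, not used by
// the rest of this file):
static __device__ __forceinline__ float warpReduceSumSketch(float val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_xor_sync(0xffffffffu, val, offset);
  }
  return val;  // every lane ends up holding the warp-wide sum
}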
+__device__ __forceinline__ int getMSB(int val) { + return 31 - __clz(val); +} + +template +struct Float2 { + Acctype v1, v2; + __device__ Float2() {} + __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert::to(v1)), v2(ScalarConvert::to(v2)) {} + __device__ Float2(Dtype v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} + __device__ Float2(int v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} + __device__ Float2& operator+=(const Float2& a) { + v1 += a.v1; + v2 += a.v2; + return *this; + } +}; + +template +struct SumOp { + __device__ SumOp(const DeviceTensor3 t) : tensor(t) {} + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + return ScalarConvert::to(tensor[batch][plane][n]); + } + const DeviceTensor3 tensor; +}; + +template +struct VarOp { + __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {} + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + Dtype val = tensor[batch][plane][n]; + return (val - mean) * (val - mean); + } + const Acctype mean; + const DeviceTensor3 tensor; +}; + +template +struct GradOp { + __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g) + : mean(m), input(i), gradOutput(g) {} + __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) { + Dtype g = gradOutput[batch][plane][n]; + Dtype c = ScalarConvert::to(input[batch][plane][n] - mean); + return Float2(g, g * c); + } + const Acctype mean; + const DeviceTensor3 input; + const DeviceTensor3 gradOutput; +}; + +// Sum across all threads within a warp +template +static __device__ __forceinline__ T warpSum(T val) { +#if __CUDA_ARCH__ >= 300 + for (int i = 0; i < getMSB(WARP_SIZE); ++i) { + val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE); + } +#else + __shared__ T values[MAX_BLOCK_SIZE]; + values[threadIdx.x] = val; + __threadfence_block(); + const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE; + for (int i = 1; i < WARP_SIZE; i++) { + val += values[base + ((i + threadIdx.x) % WARP_SIZE)]; + } +#endif + return val; +} + +template +static __device__ __forceinline__ Float2 warpSum(Float2 value) { + value.v1 = warpSum(value.v1); + value.v2 = warpSum(value.v2); + return value; +} + +// Sum across (batch, x/y/z) applying Op() pointwise +template +__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) { + T sum = (T)0; + for (int batch = 0; batch < tensor.getSize(0); ++batch) { + for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) { + sum += op(batch, plane, x); + } + } + + // sum over NumThreads within a warp + sum = warpSum(sum); + + // 'transpose', and reduce within warp again + __shared__ T shared[32]; + __syncthreads(); + if (threadIdx.x % WARP_SIZE == 0) { + shared[threadIdx.x / WARP_SIZE] = sum; + } + if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { + // zero out the other entries in shared + shared[threadIdx.x] = (T)0; + } + __syncthreads(); + if (threadIdx.x / WARP_SIZE == 0) { + sum = warpSum(shared[threadIdx.x]); + if (threadIdx.x == 0) { + shared[0] = sum; + } + } + __syncthreads(); + + // Everyone picks it up, should be broadcast into the whole gradInput + return shared[0]; +} + +template +__global__ void BatchNormalizationUpdateOutputInference_kernel( + const DeviceTensor3 input, + DeviceTensor3 output, + const DeviceTensor1 runningMean, + const DeviceTensor1 runningVar, + const DeviceTensor1 weight, + const DeviceTensor1 bias, + Acctype epsilon) { + + int plane = blockIdx.x; + + Acctype invstd = Acctype(1) / 
sqrt(runningVar[plane].ldg() + epsilon); + Acctype mean = ScalarConvert::to(runningMean[plane].ldg()); + Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane].ldg()) : Acctype(1); + Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane].ldg()) : Acctype(0); + + // Write normalized and update the output + for (int batch = 0; batch < input.getSize(0); batch++) { + for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invstd + beta); + } + } +} + +template +__global__ void BatchNormalizationUpdateOutput_kernel( + const DeviceTensor3 input, + DeviceTensor3 output, + const DeviceTensor1 weight, + const DeviceTensor1 bias, + const Acctype epsilon, + const Acctype momentum, + DeviceTensor1 runningMean, + DeviceTensor1 runningVar, + DeviceTensor1 saveMean, + DeviceTensor1 saveStd) { + + int plane = blockIdx.x; + int N = input.getSize(0) * input.getSize(2); + + Acctype norm = Acctype(1) / N; + + // Compute the mean and variance across (batch, x/y/z) + Acctype mean = reduce(SumOp(input), input, plane) * norm; + __syncthreads(); + Acctype varN = reduce(VarOp(mean, input), input, plane); + Acctype invStd = 0; + if (varN != Acctype(0) || epsilon != Acctype(0)) { + invStd = 1 / sqrt(varN * norm + epsilon); + } + + // Save the mean, variance, and moving averages + if (threadIdx.x == 0) { + // Momentum based writeback + Acctype unbiasedVar = varN / (N - 1); + saveMean[plane] = ScalarConvert::to(mean); + saveStd[plane] = ScalarConvert::to(invStd); + if (runningMean.data() != NULL) { + runningMean[plane] = ScalarConvert::to((1 - momentum) * runningMean[plane] + momentum * mean); + } + if (runningVar.data() != NULL) { + runningVar[plane] = ScalarConvert::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar); + } + } + + // Write normalized and update the output + Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : ScalarConvert::to(1); + Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) : ScalarConvert::to(0); + for (int batch = 0; batch < input.getSize(0); ++batch) { + for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invStd + beta); + } + } +} + +template +__global__ void BatchNormalizationBackward_kernel( + const DeviceTensor3 input, + const DeviceTensor3 gradOutput, + DeviceTensor3 gradInput, + DeviceTensor1 gradWeight, + DeviceTensor1 gradBias, + const DeviceTensor1 weight, + const DeviceTensor1 runningMean, + const DeviceTensor1 runningVar, + const DeviceTensor1 saveMean, + const DeviceTensor1 saveStd, + bool train, + Acctype scale, + double eps) { + + int plane = blockIdx.x; + int N = gradOutput.getSize(0) * gradOutput.getSize(2); + + Acctype mean, stdVal; + if (train) { + mean = ScalarConvert::to(saveMean[plane]); + stdVal = ScalarConvert::to(saveStd[plane]); + } else { + mean = ScalarConvert::to(runningMean[plane]); + stdVal = 1 / sqrt(runningVar[plane] + eps); + } + + Acctype weightVal = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : Acctype(1); + Acctype norm = Acctype(1) / N; + + // Compute two values across (batch, x/y/z) in one pass: + // 1. Sum(gradOutput) + // 2. 
DotProduct(input - mean, gradOutput) + GradOp g(mean, input, gradOutput); + Float2 res = reduce, GradOp, DeviceTensor3>(g, gradOutput, plane); + Acctype gradOutputSum = res.v1; + Acctype dotP = res.v2; + + Acctype gradMean = gradOutputSum * norm; + Acctype projScale = dotP * norm * stdVal * stdVal; + Acctype gradScale = stdVal * weightVal; + + if (gradInput.numElements() > 0) { + for (int batch = 0; batch < gradOutput.getSize(0); ++batch) { + for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) { + Dtype gradOut = gradOutput[batch][plane][x]; + if (train) { + Dtype inp = input[batch][plane][x]; + Acctype proj = (inp - mean) * projScale; + gradInput[batch][plane][x] = ScalarConvert::to((gradOut - proj - gradMean) * gradScale); + } else { + gradInput[batch][plane][x] = ScalarConvert::to(gradOut * gradScale); + } + } + } + } + + if (gradWeight.numElements() > 0) { + if (threadIdx.x == 0) { + gradWeight[plane] += ScalarConvert::to(scale * dotP * stdVal); + } + } + + if (gradBias.numElements() > 0) { + if (threadIdx.x == 0) { + gradBias[plane] += ScalarConvert::to(scale * gradOutputSum); + } + } +} + +#include "generic/BatchNormalization.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt new file mode 100644 index 0000000..79b11c2 --- /dev/null +++ b/aten/src/THCUNN/CMakeLists.txt @@ -0,0 +1,88 @@ +SET(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} +${CMAKE_CURRENT_SOURCE_DIR}/AbsCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/Abs.cu +${CMAKE_CURRENT_SOURCE_DIR}/BatchNormalization.cu +${CMAKE_CURRENT_SOURCE_DIR}/BCECriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/ClassNLLCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/Col2Im.cu +${CMAKE_CURRENT_SOURCE_DIR}/DistKLDivCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/ELU.cu +${CMAKE_CURRENT_SOURCE_DIR}/FeatureLPPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/FusedRNNKernel.cu +${CMAKE_CURRENT_SOURCE_DIR}/GatedLinearUnit.cu +${CMAKE_CURRENT_SOURCE_DIR}/HardTanh.cu +${CMAKE_CURRENT_SOURCE_DIR}/Im2Col.cu +${CMAKE_CURRENT_SOURCE_DIR}/IndexLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/L1Cost.cu +${CMAKE_CURRENT_SOURCE_DIR}/LeakyReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/LogSigmoid.cu +${CMAKE_CURRENT_SOURCE_DIR}/LookupTableBag.cu +${CMAKE_CURRENT_SOURCE_DIR}/LookupTable.cu +${CMAKE_CURRENT_SOURCE_DIR}/MarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MSECriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MultiLabelMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/MultiMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/PReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/RReLU.cu +${CMAKE_CURRENT_SOURCE_DIR}/Sigmoid.cu +${CMAKE_CURRENT_SOURCE_DIR}/SmoothL1Criterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftMarginCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftPlus.cu +${CMAKE_CURRENT_SOURCE_DIR}/SoftShrink.cu +${CMAKE_CURRENT_SOURCE_DIR}/SparseLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialClassNLLCriterion.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionLocal.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialCrossMapLRN.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu 
+${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialSubSampling.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/Sqrt.cu +${CMAKE_CURRENT_SOURCE_DIR}/Square.cu +${CMAKE_CURRENT_SOURCE_DIR}/Tanh.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalReflectionPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalRowConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingLinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/Threshold.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAveragePooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingNearest.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingTrilinear.cu +PARENT_SCOPE) + +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} + "${CMAKE_CURRENT_SOURCE_DIR}" +PARENT_SCOPE) + +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DESTINATION ${ATEN_INSTALL_INCLUDE_SUBDIR} + FILES_MATCHING PATTERN "*.h" PATTERN "*.cuh") diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu new file mode 100644 index 0000000..1043454 --- /dev/null +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -0,0 +1,185 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include +#include + +static const int NTHREADS = 32; + +template +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int size_average, + int n_classes, + int64_t ignore_index) { + assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0); + + // TODO: T4951791 Reuse code between updateOutput_kernel1 and + // updateOutput_kernel. + + int t = (int) *target - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + Dtype cur_weight = weights ? 
weights[t] : ScalarConvert::to(1); + *output = -cur_weight * input[t]; + *total_weight = cur_weight; + if (size_average && *total_weight > 0) { + *output /= *total_weight; + } + } +} + +template +__global__ void ClassNLLCriterion_updateOutput_no_reduce_kernel( + int batch_size, + THCDeviceTensor input, + THCDeviceTensor target, + THCDeviceTensor output, + Dtype *weights, + int n_classes, + int ignore_index) { + + CUDA_KERNEL_LOOP(index, batch_size) { + int cur_target = target[index] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + output[index] = ScalarConvert::to(0); + continue; + } + assert(cur_target >= 0 && cur_target < n_classes); + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + output[index] = -weight * input[index][cur_target]; + } +} + +template +__global__ void ClassNLLCriterion_updateGradInput_no_reduce_kernel( + int batch_size, + THCDeviceTensor target, + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + Dtype *weights, + int n_classes, + int ignore_index) { + + CUDA_KERNEL_LOOP(index, batch_size) { + int cur_target = target[index] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + assert(cur_target >= 0 && cur_target < n_classes); + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + gradInput[index][cur_target] = -weight * gradOutput[index]; + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int size_average, + int nframe, + int ndim, + int n_classes, + int64_t ignore_index) { + __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS]; + int i, t; + Dtype cur_weight; + + shInputs[threadIdx.x] = ScalarConvert::to(0); + acc_weight[threadIdx.x] = ScalarConvert::to(0); + for (i = threadIdx.x; i < nframe; i += NTHREADS) { + t = target[i] - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + cur_weight = weights ? weights[t] : ScalarConvert::to(1); + shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight; + acc_weight[threadIdx.x] += cur_weight; + } + } + __syncthreads(); + + // TODO: T4951791 Reuse code between updateOutput_kernel1 and + // updateOutput_kernel + + if (threadIdx.x == 0) { + *output = *total_weight = ScalarConvert::to(0); + Acctype outputAcc = 0; + Acctype total_weightAcc = 0; + for (i = 0; i < NTHREADS; ++i){ + // FIXME should we do somethigng here + outputAcc += shInputs[i]; + total_weightAcc += acc_weight[i]; + } + *total_weight = ScalarConvert::to(total_weightAcc); + *output = ScalarConvert::to(outputAcc); + if (size_average && *total_weight > 0) { + *output = ScalarConvert::to(outputAcc / total_weightAcc); + } + + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1( + Dtype* gradInput, + Dtype* gradOutput, + Dtype* weights, + THCIndex_t* target, + Dtype* total_weight, + int size_average, + int n_classes, + int64_t ignore_index) +{ + if (*total_weight <= 0) { + return; + } + Dtype norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + int t = (int)*target - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[t] = -(weights ? 
weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } +} + +template +__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel( + Dtype *gradInput, + Dtype *gradOutput, + THCIndex_t *target, + Dtype *weights, + Dtype *total_weight, + int size_average, + int nframe, + int ndim, + int n_classes, + int64_t ignore_index) +{ + if (*total_weight <= 0) { + return; + } + int i, t; + Dtype norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + + for (i = threadIdx.x; i < nframe; i += NTHREADS) { + t = (int)target[i] - TH_INDEX_BASE; + if (t != (int) ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } + } +} + +#include "generic/ClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu new file mode 100644 index 0000000..d7fd995 --- /dev/null +++ b/aten/src/THCUNN/Col2Im.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/Col2Im.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu new file mode 100644 index 0000000..e4e85b7 --- /dev/null +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -0,0 +1,64 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct kl_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + Acctype yAcc = ScalarConvert::to(y); + return y > 0 ? yAcc * (THCNumerics::log(yAcc) - x) : Acctype(0); + } +}; + +template +struct kl_updateOutput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *output) + { + *output = *y > 0 ? *y * (THCNumerics::log(*y) - *x) : ScalarConvert::to(0); + } +}; + +template +struct kl_updateGradInput_no_reduce_functor +{ + __host__ __device__ void operator()( + const Dtype *target, + const Dtype *gradOutput, + Dtype *gradInput) + { + *gradInput = *target > 0 ? (-*target) * *gradOutput : ScalarConvert::to(0); + } +}; + +template +struct kl_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + kl_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + return y > 0 ? norm * (-y) * gradOutput : ScalarConvert::to(0); + } +}; + +#include "generic/DistKLDivCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu new file mode 100644 index 0000000..d17d185 --- /dev/null +++ b/aten/src/THCUNN/ELU.cu @@ -0,0 +1,59 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct ELUupdateOutput_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateOutput_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *output, const T *input) const + { + *output = *input <= 0 ? 
(exp(*input) - 1) * negcoef_ : *input * poscoef_; + } +}; + +// in-place variant +template +struct ELUupdateOutputIP_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateOutputIP_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *x) const + { + *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; + } +}; + +template +struct ELUupdateGradInput_functor +{ + const T negcoef_; + const T poscoef_; + + ELUupdateGradInput_functor(T negcoef, T poscoef) + : negcoef_(negcoef) + , poscoef_(poscoef) + {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); + } +}; + +#include "generic/ELU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/FeatureLPPooling.cu b/aten/src/THCUNN/FeatureLPPooling.cu new file mode 100644 index 0000000..7026f0d --- /dev/null +++ b/aten/src/THCUNN/FeatureLPPooling.cu @@ -0,0 +1,653 @@ +#include "THCUNN.h" +#include "THCAtomics.cuh" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCNumerics.cuh" +#include "THCTensorTypeUtils.cuh" + +#define OUTPUT_FEATURES_PER_THREAD 32 +#define MAX_WARPS_PER_RUN 4 + +namespace detail { + +/// Various utilities for dealing with arrays of values which are +/// maintained in thread-local registers. All accesses are done in such +/// a way such that the index is statically known, which preserves the +/// compiler's ability to allocate the values to registers, as opposed +/// to local memory. +template +struct RegisterUtils { + /// Register shifting: move elements towards the beginning of the + /// array (towards 0) by `Shift` places: + /// arr[i] = arr[i + Shift] + /// The `Shift` elements at the end are left unchanged. 
+ template + __device__ __forceinline__ static void shiftLeft(T arr[N]) { + // e.g., N = 5, Shift = 2: + // 0 1 2 3 4 becomes => + // 2 3 4 3 4 (last are unchanged) +#pragma unroll + for (int i = 0; i < N - Shift; ++i) { + arr[i] = arr[i + Shift]; + } + } +}; + +template +__device__ __forceinline__ +int getDim1Point(const THCDeviceTensor& input) { + int threadPoint = blockIdx.x * blockDim.x + threadIdx.x; + return threadPoint / input.getSize(3); +} + +template +__device__ __forceinline__ +int getDim2Point(const THCDeviceTensor& input) { + int threadPoint = blockIdx.x * blockDim.x + threadIdx.x; + return threadPoint % input.getSize(3); +} + +__device__ __forceinline__ +int getStartOutputFeature() { + return blockIdx.y * OUTPUT_FEATURES_PER_THREAD; +} + +template +__device__ __forceinline__ +int getEndOutputFeature(const THCDeviceTensor& output) { + return min((blockIdx.y + 1) * OUTPUT_FEATURES_PER_THREAD, output.getSize(1)); +} + +__device__ __forceinline__ +int getBatch() { + return blockIdx.z; +} + +// All of these functions that follow are MathOps; they are template +// parameters so L2 can be more efficiently implemented +// template +// typedef T (*MathOp)(const T in, const T arg); + +template +__device__ __forceinline__ T power2(const T in, const T power) { + return THCNumerics::mul(in, in); +} + +template +__device__ __forceinline__ T root2(const T in, const T power) { + return THCNumerics::sqrt(in); +} + +template +__device__ __forceinline__ T powerGrad2(const T in, const T power) { + return in; +} + +template +__device__ __forceinline__ T powerN(const T in, const T power) { + return THCNumerics::pow(in, power); +} + +template +__device__ __forceinline__ T rootN(const T in, const T power) { + const T invPower = THCNumerics::cinv(power); + return THCNumerics::pow(in, invPower); +} + +template +__device__ __forceinline__ T powerGradN(const T in, const T power) { + return THCNumerics::pow(in, + THCNumerics::sub(power, + ScalarConvert::to(1))); +} + +// Input is of the form: +// [batch][feature dim][optional dim 1][optional dim 2] +template +__global__ void +featureLPPoolingUpdateOutput(const THCDeviceTensor input, + THCDeviceTensor output, + T power) { + // What non-feature points is this thread handling? + int dim1Point = getDim1Point(input); + int dim2Point = getDim2Point(input); + + if (dim1Point >= input.getSize(2) || dim2Point >= input.getSize(3)) { + // This thread in the warp is out of bounds + return; + } + + // What feature points is this thread handling? + int startOutputFeature = getStartOutputFeature(); + int endOutputFeature = getEndOutputFeature(output); + int startInputFeature = startOutputFeature * Stride; + + // What batch points is this thread handling? + int batch = getBatch(); + + // If stride >= width, then there is no loaded data reuse. + // If stride > 1 and stride < width, then shift by stride, since we + // can reuse Width - Stride elements from the previous round. + // e.g., width = 5, stride = 2, + // output 0 uses input 0 1 2 3 4 + // output 1 uses input 2 3 4 5 6 (inputs 2 - 4 are reused, i.e., 5 - + // 2 elements are reused, and we have to shift the array by 2) + // + // e.g., width = 5, stride = 3, + // output 0 uses input 0 1 2 3 4 + // output 1 uses input 3 4 5 6 7 (inputs 3 - 4 are reused, i.e., 5 - 3 + // elements are reused, and we have to shift the array by 3) + + // Valid only pooling: load Width elements from input (Width - + // Stride is handled here, at the top of the loop we handle the + // remaining Stride elements). 
We already verified that the input is + // larger than the width. + // `in` will contain the input values ^ power. + T in[Width]; + +#pragma unroll + for (int i = 0; i < Width - Stride; ++i) { + const T data = + input[batch][startInputFeature + i][dim1Point][dim2Point]; + in[i] = PowerFunc(data, power); + } + + for (int outputFeature = startOutputFeature; + outputFeature < endOutputFeature; + ++outputFeature) { + // If Stride < Width, we're loading Stride new values starting at + // Width - Stride + // If Stride >= Width, we're loading Width new values starting at 0 + if (Stride < Width) { + int nextInputFeature = outputFeature * Stride + Width - Stride; + +#pragma unroll + for (int i = 0; i < Stride; ++i) { + const T data = + input[batch][nextInputFeature + i][dim1Point][dim2Point]; + in[Width - Stride + i] = PowerFunc(data, power); + } + } else { + int nextInputFeature = outputFeature * Stride; + +#pragma unroll + for (int i = 0; i < Width; ++i) { + T data = input[batch][nextInputFeature + i][dim1Point][dim2Point]; + in[i] = PowerFunc(data, power); + } + } + + // Calculate the new output feature + T val = ScalarConvert::to(0); + for (int i = 0; i < Width; ++i) { + val = THCNumerics::add(val, in[i]); + } + + val = RootFunc(val, power); + output[batch][outputFeature][dim1Point][dim2Point] = val; + + if (Stride < Width) { + // Shift registers for calculating the next point + RegisterUtils::template shiftLeft(in); + } + } +} + +// forward pass: f(a, ..., z) = (a^p + ... + z^p)^(1 / p) +// for bprop: +// partial df(a, ... z)/da = a^(p - 1) * (a^p + ... + z^p)^((1 / p) - 1) = +// a^(p - 1) * 1/(f(a, ..., z)^(p - 1)) = (a / f(a, ..., z))^(p - 1) +// +// example: for p = 2, df(a, ..., z)/da = a / f(a, ..., z) +// example: for p = 3, df(a, ..., z)/da = (a / f(a, ..., z))^2 +// +// PowerGradFunc implements x^(p - 1) +template +__global__ void +featureLPPoolingUpdateGradInput(const THCDeviceTensor gradOutput, + const THCDeviceTensor input, + const THCDeviceTensor output, + THCDeviceTensor gradInput, + T power) { + // What non-feature points is this thread handling? + int dim1Point = getDim1Point(input); + int dim2Point = getDim2Point(input); + + if (dim1Point >= input.getSize(2) || dim2Point >= input.getSize(3)) { + // This thread in the warp is out of bounds + return; + } + + // What feature points is this thread handling? [start, end) + int startOutputFeature = getStartOutputFeature(); + int endOutputFeature = getEndOutputFeature(output); + + // What is the first input point that the output features depend + // upon? [start, end) + int startInputFeature = startOutputFeature * Stride; + int endInputFeature = endOutputFeature * Stride; + + // What batch points is this thread handling? + int batch = getBatch(); + + // atomicAdd into gradInput is slow, avoid it where possible. + // We can do this because there is a range of gradInput elements + // that we are updating exclusively. 
This is how we find it + // + // width = 3 stride = 1 example: + // ------------------------------ + // startOutputFeature for this thread + // | + // | + // previous thread's output feature + // | | + // | | gradOutput + // __v____v___________________ + // | | | | | | + // --------------------------- + // |\ \_____ + // | \__ \ gradInput + // __v____v____v_____________ + // | | | | | | + // --------------------------- + // A A + // | | + // startInputFeature + // | + // exclusiveStartInputFeature + // + // exclusiveStartInputFeature is the first input feature that we can + // write into exclusively; the one right before it overlaps with + // updates from a previous thread and thus has to use atomicAdd. + int exclusiveStartInputFeature = + startInputFeature == 0 ? + // no thread is before ourselves + 0 : + // there is a thread before ourselves + startInputFeature + (Width - 1) * Stride; + + // Similarly, exclusiveEndInputFeature is the last input feature + // that we can write into exclusively, since we might be overlapping + // with the following thread + int exclusiveEndInputFeature = + endOutputFeature == output.getSize(1) ? + // no thread is after ourselves + endInputFeature + (Width - 1) * Stride : + // there is a thread after ourselves + endInputFeature; + + // As with updateOutput preload input elements, except no need to + // transform them + T in[Width]; +#pragma unroll + for (int i = 0; i < Width - Stride; ++i) { + in[i] = input[batch][startInputFeature + i][dim1Point][dim2Point]; + } + + for (int outputFeature = startOutputFeature; + outputFeature < endOutputFeature; + ++outputFeature) { + // As with updateOutput load the subsequent input elements that we + // need, except no need to transform them + // + // If Stride < Width, we're loading Stride new values starting at + // Width - Stride + // If Stride >= Width, we're loading Width new values starting at 0 + if (Stride < Width) { + int nextInputFeature = outputFeature * Stride + Width - Stride; + +#pragma unroll + for (int i = 0; i < Stride; ++i) { + in[Width - Stride + i] = + input[batch][nextInputFeature + i][dim1Point][dim2Point]; + } + } else { + int nextInputFeature = outputFeature * Stride; + +#pragma unroll + for (int i = 0; i < Width; ++i) { + in[i] = input[batch][nextInputFeature + i][dim1Point][dim2Point]; + } + } + + // A given output feature gradient contributes to `Width` input + // gradients + const T gradOut = + gradOutput[batch][outputFeature][dim1Point][dim2Point]; + + // Load output (f(x_is)). It is possible that this is zero, in + // which case we'll ignore this point. 
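+      // Illustrative note: the backward factor computed below is
+      //   gradOut * (x_i / f(x_1, ..., x_w))^(p - 1),
+      // which divides by the forward output f. Skipping output points that are
+      // exactly zero therefore avoids a division by zero; such a point simply
+      // contributes nothing to gradInput (which was zeroed by the launcher).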
+ T out = output[batch][outputFeature][dim1Point][dim2Point]; + if (THCNumerics::eq(out, ScalarConvert::to(0))) { + continue; + } + + int curStartInputFeature = outputFeature * Stride; + int curEndInputFeature = outputFeature * Stride + Width - 1; + + if (curStartInputFeature >= exclusiveStartInputFeature && + curEndInputFeature < exclusiveEndInputFeature) { + // This thread is exclusively responsible for updating these + // input points, so we need not make the addition atomic + for (int i = 0; i < Width; ++i) { + int inputFeature = outputFeature * Stride + i; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + const T val = THCNumerics::mul( + gradOut, + PowerGradFunc(THCNumerics::div(in[i], out), power)); + + gradInput[batch][inputFeature][dim1Point][dim2Point] = + THCNumerics::add( + gradInput[batch][inputFeature][dim1Point][dim2Point], val); + } + } else { + // Handle start and end boundary cases: potential overlap with + // other threads + for (int i = 0; i < Width; ++i) { + int inputFeature = outputFeature * Stride + i; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + T val = THCNumerics::mul( + gradOut, + PowerGradFunc(THCNumerics::div(in[i], out), power)); + + // We don't overlap other threads for this range + if (inputFeature >= exclusiveStartInputFeature && + inputFeature < exclusiveEndInputFeature) { + gradInput[batch][inputFeature][dim1Point][dim2Point] + = THCNumerics::add( + gradInput[batch][inputFeature][dim1Point][dim2Point], val); + } else { + // We are potentially overlapping with threads handling + // features before ourselves, so these need to be added atomically + atomicAdd(&gradInput[batch][inputFeature][dim1Point][dim2Point], + val); + } + } + } + + if (Stride < Width) { + // Shift registers for calculating the next point + RegisterUtils::template shiftLeft(in); + } + } +} + +} // namespace detail + +inline int lpPoolingOutputSize(int inputSize, int width, int stride) { + return ((inputSize - width) / stride) + 1; +} + +template +bool +runFeatureLPPoolingUpdateOutput(THCState* state, + const THCDeviceTensor& input, + THCDeviceTensor& output, + float power, int width, int stride) { + cudaStream_t stream = + THCState_getCurrentStream(state); + const cudaDeviceProp* deviceProperties = + THCState_getCurrentDeviceProperties(state); + + int outputFeatures = ((input.getSize(1) - width) / stride) + 1; + + THAssert(input.getSize(0) == output.getSize(0)); + THAssert(outputFeatures == output.getSize(1)); + THAssert(input.getSize(1) >= width); + + THAssert(input.getSize(2) == output.getSize(2)); + THAssert(input.getSize(3) == output.getSize(3)); + THAssert(power > 0.0f); + THAssert(width >= 1); + THAssert(stride >= 1); + + // Split non-features among threads and grid x + int totalNonFeatureSize = input.getSize(2) * input.getSize(3); + int numWarps = + min(THCCeilDiv(totalNonFeatureSize, deviceProperties->warpSize), + MAX_WARPS_PER_RUN); + int blockSize = deviceProperties->warpSize * numWarps; + + // Split non-features among grid x + int nonFeatureSizeBlocks = THCCeilDiv(totalNonFeatureSize, blockSize); + + // Split features among grid y, up to a maximum number of features per thread + int featureBlocks = THCCeilDiv(outputFeatures, OUTPUT_FEATURES_PER_THREAD); + + // Split batch among grid z. 
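+  // Illustrative example (hypothetical sizes): for an input of shape
+  // [8][64][20][20] with width = 3 and stride = 2, outputFeatures =
+  // (64 - 3) / 2 + 1 = 31 and totalNonFeatureSize = 20 * 20 = 400; with
+  // warpSize = 32 this gives blockSize = min(ceil(400 / 32), 4) * 32 = 128,
+  // so grid = (ceil(400 / 128), ceil(31 / 32), 8) = (4, 1, 8), block = (128).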
+ dim3 grid(nonFeatureSizeBlocks, featureBlocks, input.getSize(0)); + dim3 block(blockSize); + +#define L2_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateOutput<<>>( \ + input, output, \ + ScalarConvert::to(power)); \ + return true; + +#define L2_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + L2_STRIDE_CASE(1, WIDTH); \ + L2_STRIDE_CASE(2, WIDTH); \ + L2_STRIDE_CASE(3, WIDTH); \ + L2_STRIDE_CASE(4, WIDTH); \ + } + +#define LP_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateOutput<<>>( \ + input, output, \ + ScalarConvert::to(power)); \ + return true; + +#define LP_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + LP_STRIDE_CASE(1, WIDTH); \ + LP_STRIDE_CASE(2, WIDTH); \ + LP_STRIDE_CASE(3, WIDTH); \ + LP_STRIDE_CASE(4, WIDTH); \ + } + + if (power == 2.0f) { + switch (width) { + L2_WIDTH_CASE(2); + L2_WIDTH_CASE(3); + L2_WIDTH_CASE(4); + L2_WIDTH_CASE(5); + L2_WIDTH_CASE(6); + L2_WIDTH_CASE(7); + L2_WIDTH_CASE(8); + L2_WIDTH_CASE(9); + L2_WIDTH_CASE(10); + L2_WIDTH_CASE(11); + L2_WIDTH_CASE(12); + L2_WIDTH_CASE(13); + L2_WIDTH_CASE(14); + L2_WIDTH_CASE(15); + L2_WIDTH_CASE(16); + } + } else { + switch (width) { + LP_WIDTH_CASE(2); + LP_WIDTH_CASE(3); + LP_WIDTH_CASE(4); + LP_WIDTH_CASE(5); + LP_WIDTH_CASE(6); + LP_WIDTH_CASE(7); + LP_WIDTH_CASE(8); + LP_WIDTH_CASE(9); + LP_WIDTH_CASE(10); + LP_WIDTH_CASE(11); + LP_WIDTH_CASE(12); + LP_WIDTH_CASE(13); + LP_WIDTH_CASE(14); + LP_WIDTH_CASE(15); + LP_WIDTH_CASE(16); + } + } + + // Otherwise, we have an unhandled width and/or stride. + return false; + +#undef L2_STRIDE_CASE +#undef L2_WIDTH_CASE +#undef LP_STRIDE_CASE +#undef LP_WIDTH_CASE +} + +template +bool +runFeatureLPPoolingUpdateGradInput(THCState* state, + const THCDeviceTensor& gradOutput, + const THCDeviceTensor& input, + const THCDeviceTensor& output, + THCDeviceTensor& gradInput, + float power, int width, int stride) { + cudaStream_t stream = + THCState_getCurrentStream(state); + const cudaDeviceProp* deviceProperties = + THCState_getCurrentDeviceProperties(state); + + for (int i = 0; i < 4; ++i) { + THAssert(gradOutput.getSize(i) == output.getSize(i)); + THAssert(gradInput.getSize(i) == input.getSize(i)); + } + + int outputFeatures = ((input.getSize(1) - width) / stride) + 1; + + THAssert(gradInput.getSize(0) == gradOutput.getSize(0)); + THAssert(outputFeatures == gradOutput.getSize(1)); + THAssert(gradInput.getSize(1) >= width); + + THAssert(gradInput.getSize(2) == gradOutput.getSize(2)); + THAssert(gradInput.getSize(3) == gradOutput.getSize(3)); + THAssert(power > 0.0f); + THAssert(width >= 1); + THAssert(stride >= 1); + + // Different threads are potentially adding into overlapping input + // points, so we must clear out gradInput before continuing. + gradInput.zero(stream); + + // Split non-features among threads and grid x + int totalNonFeatureSize = input.getSize(2) * input.getSize(3); + int numWarps = + min(THCCeilDiv(totalNonFeatureSize, deviceProperties->warpSize), + MAX_WARPS_PER_RUN); + int blockSize = deviceProperties->warpSize * numWarps; + + // Split non-features among grid x + int nonFeatureSizeBlocks = THCCeilDiv(totalNonFeatureSize, blockSize); + + // Split features among grid y, up to a maximum number of features per thread + int featureBlocks = THCCeilDiv(outputFeatures, OUTPUT_FEATURES_PER_THREAD); + + // Split batch among grid z. 
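+  // Note on the dispatch below: for power == 2 the backward factor
+  // (x_i / f)^(p - 1) reduces to x_i / f, so the L2 cases use the identity
+  // functor powerGrad2, while the generic LP cases use powerGradN (which
+  // calls pow with exponent p - 1).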
+ dim3 grid(nonFeatureSizeBlocks, featureBlocks, input.getSize(0)); + dim3 block(blockSize); + +#define L2_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateGradInput< \ + T, WIDTH, STRIDE, detail::powerGrad2><<>>( \ + gradOutput, input, output, gradInput, \ + ScalarConvert::to(power)); \ + return true; + +#define L2_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + L2_STRIDE_CASE(1, WIDTH); \ + L2_STRIDE_CASE(2, WIDTH); \ + L2_STRIDE_CASE(3, WIDTH); \ + L2_STRIDE_CASE(4, WIDTH); \ + } + +#define LP_STRIDE_CASE(STRIDE, WIDTH) \ + case STRIDE: \ + detail:: \ + featureLPPoolingUpdateGradInput< \ + T, WIDTH, STRIDE, detail::powerGradN><<>>( \ + gradOutput, input, output, gradInput, \ + ScalarConvert::to(power)); \ + return true; + +#define LP_WIDTH_CASE(WIDTH) \ + case WIDTH: \ + switch (stride) { \ + LP_STRIDE_CASE(1, WIDTH); \ + LP_STRIDE_CASE(2, WIDTH); \ + LP_STRIDE_CASE(3, WIDTH); \ + LP_STRIDE_CASE(4, WIDTH); \ + } + + if (power == 2.0f) { + switch (width) { + L2_WIDTH_CASE(2); + L2_WIDTH_CASE(3); + L2_WIDTH_CASE(4); + L2_WIDTH_CASE(5); + L2_WIDTH_CASE(6); + L2_WIDTH_CASE(7); + L2_WIDTH_CASE(8); + L2_WIDTH_CASE(9); + L2_WIDTH_CASE(10); + L2_WIDTH_CASE(11); + L2_WIDTH_CASE(12); + L2_WIDTH_CASE(13); + L2_WIDTH_CASE(14); + L2_WIDTH_CASE(15); + L2_WIDTH_CASE(16); + } + } else { + switch (width) { + LP_WIDTH_CASE(2); + LP_WIDTH_CASE(3); + LP_WIDTH_CASE(4); + LP_WIDTH_CASE(5); + LP_WIDTH_CASE(6); + LP_WIDTH_CASE(7); + LP_WIDTH_CASE(8); + LP_WIDTH_CASE(9); + LP_WIDTH_CASE(10); + LP_WIDTH_CASE(11); + LP_WIDTH_CASE(12); + LP_WIDTH_CASE(13); + LP_WIDTH_CASE(14); + LP_WIDTH_CASE(15); + LP_WIDTH_CASE(16); + } + } + + // Otherwise, we have an unhandled width and/or stride. + return false; + +#undef L2_STRIDE_CASE +#undef L2_WIDTH_CASE +#undef LP_STRIDE_CASE +#undef LP_WIDTH_CASE +} + +#include "generic/FeatureLPPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu new file mode 100644 index 0000000..d8b594a --- /dev/null +++ b/aten/src/THCUNN/FusedRNNKernel.cu @@ -0,0 +1,46 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCNumerics.cuh" +#include + +template +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(T* out, T* in) const { + T one = (T) 1.0; + *out = one / (one + THCNumerics::exp(- *in)); + } + + __device__ __forceinline__ void operator()(T* v) const { + T one = (T) 1.0; + *v = one / (one + THCNumerics::exp(- *v)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSigmoidOp { + __device__ __forceinline__ void operator()(half* out, half* in) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); +#else + float fin = ScalarConvert::to(*in); + *out = ScalarConvert::to(1.0f / (1.0f + expf(- fin))); +#endif + } + + __device__ __forceinline__ void operator()(half* v) const { +#ifdef CUDA_HALF_INSTRUCTIONS + half one = ScalarConvert::to(1); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); +#else + float fv = ScalarConvert::to(*v); + *v = ScalarConvert::to(1.0f / (1.0f + expf(- fv))); +#endif + } +}; +#endif + +#include "generic/FusedRNNKernel.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu new file mode 100644 index 0000000..aba9f1e --- /dev/null +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -0,0 +1,37 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include 
"THCHalfAutoNumerics.cuh" +#include +#include "common.h" + +template +struct gatedLinearCSigMul_functor +{ + __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const + { + const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert::to(-*sigTensor))); + const Dtype mulNum = *mulTensor; + *target = ScalarConvert::to(sigNum * mulNum); + } +}; + + +template +struct gatedLinearDerivative +{ + const int64_t stride_i_; + const int64_t stride_gI_; + gatedLinearDerivative(int64_t stride_i, int64_t stride_gI) + :stride_i_(stride_i), stride_gI_(stride_gI){} + __device__ void operator()(Dtype * gI, const Dtype * gO, const Dtype * input) const + { + const Dtype * sigTensor = input + stride_i_; + const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert::to(-*sigTensor))); + *gI = ScalarConvert::to(sigNum * *gO); + Dtype * gIsecond = gI + stride_gI_; + *gIsecond = ScalarConvert::to((Acctype(1) - sigNum) * sigNum * *gO * *input); + } +}; + +#include "generic/GatedLinearUnit.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu new file mode 100644 index 0000000..539b22f --- /dev/null +++ b/aten/src/THCUNN/HardTanh.cu @@ -0,0 +1,63 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct hardtanhupdateOutput_functor +{ + const T max_val_; + const T min_val_; + + hardtanhupdateOutput_functor(T min_val, T max_val) + : min_val_(min_val) + , max_val_(max_val) + {} + + __device__ void operator()(T *output, const T *input) const + { + if (*input < min_val_) + *output = min_val_; + else if (*input > max_val_) + *output = max_val_; + else + *output = *input; + } + + __device__ void operator()(T *input) const + { + if (*input < min_val_) + *input = min_val_; + else if (*input > max_val_) + *input = max_val_; + } +}; + +template +struct hardtanhupdateGradInput_functor +{ + const T max_val_; + const T min_val_; + + hardtanhupdateGradInput_functor(T min_val, T max_val) + : min_val_(min_val) + , max_val_(max_val) + {} + + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const + { + if (*input <= min_val_ || *input >= max_val_) + *gradInput = ScalarConvert::to(0); + else + *gradInput = *gradOutput; + } + + __device__ void operator()(T *gradInput, const T *input) const + { + if (*input <= min_val_ || *input >= max_val_) + *gradInput = ScalarConvert::to(0); + } +}; + +#include "generic/HardTanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu new file mode 100644 index 0000000..95bdcd4 --- /dev/null +++ b/aten/src/THCUNN/Im2Col.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/Im2Col.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu new file mode 100644 index 0000000..2422af9 --- /dev/null +++ b/aten/src/THCUNN/IndexLinear.cu @@ -0,0 +1,473 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define divup(a, b) ((a) + (b) - 1) / (b) +const int THREADS_PER_BLOCK = 256; +const int THREADS_X = 32; +const int THREADS_Y = THREADS_PER_BLOCK / THREADS_X; +const int REPEAT = 32; +const int64_t NNZ_PER_BLOCK_MAX = 1024; + +/* sign MACRO */ 
+#ifndef clamp +#define clamp(a, low, high) max(min((a), (high)), (low)) +#endif + +__device__ double atomicExch(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long res = atomicExch(address_as_ull, __double_as_longlong(val)); + return __longlong_as_double(res); +} + +template +__global__ static +void updateOutput( + Ty *output, + Ty *normalizedValues, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t *keys, + const int64_t batchSize, + const int64_t outDim, + Ty *weight, + const Ty *bias, + const int64_t weightStride, + const int64_t keysOffset, + const int maxNormalize, + const int nnzPerBlock) +{ + /******************************************************* + * Adapted from the following file in arrayfire + * https://github.com/arrayfire/arrayfire/blob/v3.4.1/src/backend/opencl/kernel/csrmm.cl + * + ******************************************************* + * Original copyright notice can be seen below: + * + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + + const int64_t tidx = threadIdx.x; + const int64_t tidy = threadIdx.y; + const int64_t tid = tidy * blockDim.x + tidx; + const int64_t gidx = blockIdx.x * blockDim.x + tidx; + + + Ty *nWeight = weight; + // Offset the number of elements specified by maxNormalize + weight += gidx + maxNormalize; + output += gidx; + + bool within_N = (gidx < outDim); + + __shared__ Ty s_values[THREADS_PER_BLOCK]; + __shared__ int64_t s_keys[THREADS_PER_BLOCK]; + + const int64_t rowId = blockIdx.y; + // if (rowId >= batchSize) return; + + // Load the nonzero column offsets for current row + const int64_t batchStart = (rowId == 0 ? 0 : cumSumSizes[rowId - 1]) + blockIdx.z * nnzPerBlock; + const int64_t batchEnd = min(batchStart + nnzPerBlock, cumSumSizes[rowId]); + const int64_t batchStride = blockDim.x * blockDim.y; + + Ty outVal = 0; + // Since the number of nonzero elements might be greater than local memory available, + // Load only part of the row into local memory, perform partial dot, repeat until done. + for (int64_t id = batchStart; id < batchEnd; id += batchStride) { + // Load the current chunk of the row into local memory + int64_t lim = min(batchEnd - id, (int64_t)batchStride); + + int64_t key = tid < lim ? keys[id + tid] + keysOffset : -1; + Ty val = tid < lim ? values[id + tid] : 0; + int64_t nWeightOffset = key * weightStride; + + if (tid < lim && maxNormalize) { + Ty *nWeightCurr = nWeight + nWeightOffset; + if (train) { + Ty absVal = fabs(val); + Ty maxVal = nWeightCurr[0]; + if (absVal > maxVal) { + // Updating maxVal and invMaxVal. Go hogwild! + Ty invAbsVal = 1.0 / absVal; + atomicExch(nWeightCurr + 0, absVal); + atomicExch(nWeightCurr + 1, invAbsVal); + } + val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3]; + normalizedValues[id + tid] = val; + nWeightCurr[2] = 1; + } else { + val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3]; + } + } + + s_keys[tid] = key; + s_values[tid] = val; + __syncthreads(); + + // Perform a single "dot" operation for each thread + for (int64_t idy = tidy; within_N && idy < lim; idy += blockDim.y) { + outVal += s_values[idy] * weight[weightStride * s_keys[idy]]; + } + __syncthreads(); + } + + // s_values is no longer used at this point. Reuse it for reducing outVal. 
+ // A reduction along the y dimension now gives a single output value along x. + s_values[tid] = outVal; + for (int64_t y = blockDim.y / 2; y >= 1; y /= 2) { + __syncthreads(); + if (tidy < y) s_values[tid] = s_values[tid] + s_values[tid + y * blockDim.x]; + } + + if (within_N && tidy == 0) { + Ty val = s_values[tid] + (blockIdx.z == 0 ? bias[gidx] : 0); + if (gridDim.z == 1) { + output[rowId * outDim] = val; + } else { + atomicAdd(output + rowId * outDim, val); + } + } +} + +// This kernel takes in the following inputs: +// values of size [keysSize x 1] and gradOutput of size [batchSize x outDim], +// to generate gradWeight of size [keysSize x outDim] +// nth block along y dimension computes on the non zero elements from the nth batch. +template +__global__ static +void accGradWeight( + Ty *gradWeight, + const Ty *gradOutput, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t outDim, + const int64_t gradWeightStride, + const Ty scale, + const Ty weightDecay, + const int maxNormalize) +{ + const int64_t bidy = blockIdx.y; + const int64_t tidx = threadIdx.x; + const int64_t tidy = threadIdx.y; + const int64_t tid = tidy * blockDim.x + tidx; + const int64_t ntid = blockDim.x * blockDim.y; + const int64_t gidx = blockIdx.x * blockDim.x + tidx; + + // All the y threads in the block will use the same gradOutput value + gradOutput += bidy * outDim; + Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0); + + // Calculate the amount of work for the current block / batch. + const int64_t batchStart = bidy == 0 ? 0 : cumSumSizes[bidy - 1]; + const int64_t batchEnd = cumSumSizes[bidy]; + const int64_t batchLimit = batchEnd - batchStart; + + // Number of iterations required to finish the work for the current batch. + const int64_t iters = divup(batchLimit, ntid); + + // Offset the values to the current batch. + values += batchStart; + + // When maxNormalize is enabled, gradWeight will be twice the size. + // The first half will contain the gradients required for maxNormalization. + // The second half will contain the gradients required for updating weights. + // if maxNormalize is false, both will evaluate to the same pointer. + Ty *gradWeight0 = gradWeight + batchStart * gradWeightStride + gidx; + Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0); + + __shared__ Ty s_values[THREADS_PER_BLOCK]; + + // Using iters to avoid divergence + synchtreads + for (int64_t n = 0; n < iters; n++) { + int64_t off = n * ntid; + int64_t id = off + tid; + int64_t lim = min(ntid, batchLimit - off); + + // Read the values required for the current iteration. + s_values[tid] = id < batchLimit ? values[id] : 0; + __syncthreads(); + + if (gidx < outDim) { + if (maxNormalize) { + for (int64_t idy = tidy; idy < lim; idy += blockDim.y) { + // gradOutVal is already scaled + gradWeight0[(off + idy) * gradWeightStride] = gradOutVal; + } + } + + for (int64_t idy = tidy; idy < lim; idy += blockDim.y) { + gradWeight1[(off + idy) * gradWeightStride] = s_values[idy] * gradOutVal; + } + } + __syncthreads(); + } +} + +// The gradBias is just a reduction of gradOutput along the batches. +// There is only one block along y dimension performing the reduction. 
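+// In other words, for each output index j the kernel computes
+//   gradBias[j] = scale * sum_{b = 0 .. batchSize-1} gradOutput[b * outDim + j]
+// via one partial sum per thread along y followed by a shared-memory tree
+// reduction; when update is true the result is instead subtracted from bias
+// in place.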
+template +__global__ static +void accGradBias( + Ty *buffer, + const Ty *gradOutput, + const int64_t outDim, + const int64_t batchSize, + const Ty scale, + const Ty weightDecay) +{ + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * blockDim.x + tidx; + const int64_t idx = blockIdx.x * blockDim.x + tidx; + + + Ty gradBiasVal = 0; + gradOutput += idx; + __shared__ Ty s_gradBiasVals[THREADS_PER_BLOCK]; + + // Each thread along y calculates the partial sum. + if (idx < outDim) { + for (int64_t idy = tidy; idy < batchSize; idy += blockDim.y) { + gradBiasVal += gradOutput[idy * outDim]; + } + } + s_gradBiasVals[tid] = gradBiasVal * scale; + __syncthreads(); + + // Perform reduction is performed along y. + for (int y = blockDim.y / 2; y >= 1; y /= 2) { + if (tidy < y) { + s_gradBiasVals[tid] += s_gradBiasVals[tid + y * blockDim.x]; + } + __syncthreads(); + } + + // Write the output only from the first lane. + if (tidy == 0 && idx < outDim) { + if (update) { + // If performing inplace update, subtract from bias. + Ty *bias = buffer; + bias[idx] = (bias[idx] - s_gradBiasVals[tid]); + } else { + // If just accumulating gradients, write to gradBias. + Ty *gradBias = buffer; + gradBias[idx] = s_gradBiasVals[tid]; + } + } +} + +// Use gradWeight from accGradWeight to update the weight. +// This kernel is launched batchSize number of times. +// At each step in the iteration, the weights are updated in a sparse manner. +template +__global__ static +void updateWeight( + Ty *weight, + const Ty *gradWeight, + const int64_t *keys, + const int64_t *cumSumSizes, + const int64_t outDim, + const int64_t gradWeightStride, + const int64_t weightStride, + const int64_t keysOffset, + const Ty learningRate, + const Ty weightDecay, + const int maxNormalize, + const int64_t batchId) +{ + int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y; + + // Find the limits of the work to be done + const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1]; + const int64_t batchEnd = cumSumSizes[batchId]; + + // When maxNormalize is turned on, the weight tensor will contain + // an extra "maxNormalize" number of terms per output at the beginning. + // When maxNormalize is false, both will evaluate to same pointer. + // when maxNormalize is true, + // - nWeight[2] will contain the individual scaling factor. + // - nWeight[3] will contain the individual bias for the normalized input. + Ty *nWeight = weight; + weight += maxNormalize + gidx; + + // When maxNormalize is enabled, gradWeight will be twice the size. + // The first half will contain the gradients required for maxNormalization. + // The second half will contain the gradients required for updating weights. + // if maxNormalize is false, both will evaluate to the same pointer. + const Ty *gradWeight0 = gradWeight + gidx; + const Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0); + + if (gidx >= outDim) return; + for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) { + Ty lr = learningRate; + Ty wd = weightDecay; + int64_t weightOffset = (keys[id] + keysOffset) * weightStride; + Ty weightVal = weight[weightOffset]; + + if (maxNormalize) { + Ty scale = nWeight[weightOffset + 2]; + lr *= scale; + wd *= scale; + // nWeight[3] needs to be updated in the following manner for a given input. + // nWeight[3] = nWeight[3] - sum(gradWeight0[gidx] * weight[gidx]); + // Since problem is parallelized along gidx, use atomicAdd for the update. 
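+      // (Each gidx thread contributes its own -gradNormBias term below, and
+      // atomicAdd accumulates these per-thread terms into nWeight[3].)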
+ Ty gradNormBias = lr * weightVal * gradWeight0[id * gradWeightStride]; + atomicAdd(nWeight + weightOffset + 3, -gradNormBias); + } + + // Perform the regular update + Ty gradWeightVal = lr * gradWeight1[id * gradWeightStride]; + if (weightDecay == 0) { + weight[weightOffset] = weightVal - gradWeightVal; + } else { + weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal; + } + } +} + +// This kernel is launched batchSize number of times. +// At each step in the iteration, the weights are updated in place in a sparse manner. +template +__global__ static +void accUpdateWeight( + Ty *weight, + const int64_t weightStride, + const Ty *gradOutput, + const int64_t outDim, + const Ty *values, + const int64_t *cumSumSizes, + const int64_t *keys, + const int64_t keysOffset, + const Ty scale, + const Ty weightDecay, + const int maxNormalize, + const int64_t batchId) +{ + // Parallel along outDim. + int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x; + // Parallel along the sparse input size for current batch. + int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y; + + if (gidx >= outDim) return; + + // Find the limits of the work to be done. + const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1]; + const int64_t batchEnd = cumSumSizes[batchId]; + + gradOutput += batchId * outDim; + Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0); + + // When maxNormalize is turned on, the weight tensor will contain + // an extra "maxNormalize" number of terms per output at the beginning. + // When maxNormalize is false, both will evaluate to same pointer. + // when maxNormalize is true, + // - nWeight[2] will contain the individual scaling factor. + // - nWeight[3] will contain the individual bias for the normalized input. + Ty *nWeight = weight; + weight += maxNormalize + gidx; + + for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) { + Ty wd = weightDecay; + int64_t weightOffset = (keys[id] + keysOffset) * weightStride; + Ty gradWeightVal = gradOutVal * values[id]; + Ty weightVal = weight[weightOffset]; + + if (maxNormalize) { + Ty nScale = nWeight[weightOffset + 2]; + gradWeightVal *= nScale; + wd *= nScale; + // nWeight[3] needs to be updated in the following manner for a given input. + // nWeight[3] = nWeight[3] - sum(gradOut[gidx] * weight[gidx]); + // Since problem is parallelized along gidx, use atomicAdd for the update. 
+ Ty gradNormBias = nScale * weightVal * gradOutVal; + atomicAdd(nWeight + weightOffset + 3, -gradNormBias); + } + + // Perform the regular update + if (weightDecay == 0) { + weight[weightOffset] = weightVal - gradWeightVal; + } else { + weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal; + } + } +} + + +#ifdef CUDA_HALF_TENSOR +void THNN_CudaHalfIndexLinear_updateOutput( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor *normalizedValues, + int train) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_accGradParameters( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor* valuesBuffer, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_accUpdateGradParameters( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCudaHalfTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} + +void THNN_CudaHalfIndexLinear_updateParameters( + THCState *state, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaLongTensor *runningKeys, + THCudaLongTensor *cumSumSizes, + int64_t keysOffset, + float weightDecay, + float learningRate) { + THError("THCudaHalfTensor not supported with IndexLinear"); +} +#endif + +#include "generic/IndexLinear.cu" +#include "THCGenerateFloatType.h" +#include "generic/IndexLinear.cu" +#include "THCGenerateDoubleType.h" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu new file mode 100644 index 0000000..eda58c1 --- /dev/null +++ b/aten/src/THCUNN/L1Cost.cu @@ -0,0 +1,34 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include +#include +#include + +template +struct l1cost_functor +{ + __host__ __device__ Acctype operator()(Dtype x) const + { + return THCNumerics::abs(ScalarConvert::to(x)); + } +}; + +template +struct l1cost_updateGradInput_functor +{ + __host__ __device__ Dtype operator()(Dtype x) const + { + if (x > 0) + return ScalarConvert::to(1); + else if (x < 0) + return ScalarConvert::to(-1); + else + return ScalarConvert::to(0); + } +}; + +#include "generic/L1Cost.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu new file mode 100644 index 0000000..ec9efb8 --- /dev/null +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -0,0 +1,74 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct LeakyReLUUpdateOutput +{ + const T negval_; + + LeakyReLUUpdateOutput(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > 0) ? 
x : x * negval_; + } +}; + +// in-place variant +template +struct LeakyReLUUpdateOutputIP +{ + const T negval_; + + LeakyReLUUpdateOutputIP(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()(T *x) + { + *x = (*x > 0) ? *x : negval_ * (*x); + } +}; + +template +struct LeakyReLUUpdateGradInput +{ + const T negval_; + + LeakyReLUUpdateGradInput(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()( + T* gradInput, + T* input, + T* gradOutput) const + { + *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_; + } +}; + +template +struct LeakyReLUUpdateGradInputIP +{ + const T negval_; + + LeakyReLUUpdateGradInputIP(T negval) + : negval_(negval) + {} + + __device__ __forceinline__ void operator()( + T* gradOutput, + T* input) const + { + *gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_; + } +}; + +#include "generic/LeakyReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu new file mode 100644 index 0000000..357b7bf --- /dev/null +++ b/aten/src/THCUNN/LogSigmoid.cu @@ -0,0 +1,98 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +#if defined(_MSC_VER) || defined(__HIP_PLATFORM_HCC__) +#define ZERO_MACRO zero() +template +inline __device__ typename std::enable_if::value, T>::type zero() { + return 0.; +} + +template +inline __device__ typename std::enable_if::value, T>::type zero() { + return 0.f; +} +#else +#define ZERO_MACRO 0.f +#endif + +template +struct logSigmoid_updateOutput_functor +{ + __device__ void operator()(T *output, const T *input) const { + const T max = fmaxType(ZERO_MACRO, -*input); + const T z = THCNumerics::exp(-max) + THCNumerics::exp(-*input -max); + *output = -(max + THCNumerics::log(z)); + } +}; + + +template +struct logSigmoid_updateGradInput_functor +{ + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const { + const T max = fmaxType(ZERO_MACRO, -*input); + const T z = THCNumerics::exp(-max) + THCNumerics::exp(-*input -max); + T max_deriv = 0.f; + T sign = -1.f; + if (*input < 0.f){ + max_deriv = -1.f; + sign = 1.f; + } + *gradInput = *gradOutput * (-max_deriv - sign*((z - 1.f)/z)); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct logSigmoid_updateOutput_functor { + __device__ __forceinline__ void operator()(half* output, const half *input) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half max = fmaxType(__float2half(0.f), __hneg(*input)); + const half z = THCNumerics::exp(__hneg(max)) + THCNumerics::exp(__hneg(*input) - max); + *output = __hneg(max + THCNumerics::log(z)); +#else + float in = __half2float(*input); + float max = fmaxType(0.f, -in); + float z = THCNumerics::exp(-max) + THCNumerics::exp(-in - max); + *output = __float2half(-(max + THCNumerics::log(z))); +#endif + } +}; + +template <> +struct logSigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(half* gradInput, const half *input, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + const half zero = __float2half(0.f); + const half max = fmaxType(zero, __hneg(*input)); + const half z = THCNumerics::exp(__hneg(max)) + THCNumerics::exp(__hneg(*input) - max); + half max_deriv = zero; + half sign = __hneg(one); + if(*input < zero){ + max_deriv = __hneg(one); + sign = one; + } + *gradInput = __hmul(*gradOutput, (__hneg(max_deriv) - __hmul(sign, __hdiv(z - one, z)))); +#else + const float in = 
__half2float(*input); + const float max = fmaxType(0.f, -in); + const float z = THCNumerics::exp(-max) + THCNumerics::exp(-in - max); + const float go = __half2float(*gradOutput); + float max_deriv = 0.f; + float sign = -1.f; + if(in < 0.f){ + max_deriv = -1.f; + sign = 1.f; + } + *gradInput = __float2half(go * (-max_deriv - sign*((z - 1.f)/z))); +#endif + } +}; +#endif + +#include "generic/LogSigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu new file mode 100644 index 0000000..59aa7e8 --- /dev/null +++ b/aten/src/THCUNN/LookupTable.cu @@ -0,0 +1,227 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCThrustAllocator.cuh" +#include +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensorSort.cuh" +#include "../THC/THCTensorMathReduce.cuh" + +const int WARP_SIZE = 32; + +template + +__global__ void cunn_LookupTable_accGradParametersKernelByFeature + (int64_t *indices, + Dtype *grad, + Dtype *grad_weight, + Dtype scale, + ptrdiff_t n, + int64_t stride, + int padding_idx) +{ + extern __shared__ char buf[]; + Acctype* smem = (Acctype*)buf; + Acctype* my_s = smem + WARP_SIZE*threadIdx.y; + int* indices_batch = (int*)(buf + sizeof(Acctype)*WARP_SIZE*blockDim.y); + + const int s = (int)stride; // OK to make int, we don't expect 2 billion+ embedding row size + + const int f = threadIdx.x + blockIdx.x*blockDim.x; // feature_dim + + for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y) + { + // Entire block cooperates to load a batch of 1024 indices to process + int tid = threadIdx.x + threadIdx.y*blockDim.x; + if(batch_start + tid < n) + indices_batch[tid] = (int)(indices[batch_start + tid] - TH_INDEX_BASE); + + // Loop over the batch of <= 1024 loaded indices in chunks of blockDim.y = 32 + for(int chunk_start = batch_start; chunk_start < n; chunk_start += blockDim.y) + { + // This does double duty: it makes sure indices_batch is ready, and it makes sure match-group + // leaders are done with their accumulates before other warps start loading again. + __syncthreads(); + + int n_this_chunk = (n - chunk_start) < blockDim.y ? (n - chunk_start) : blockDim.y; + + int src_row = chunk_start + threadIdx.y; + int dst_row = indices_batch[src_row - batch_start]; // This warp's target row in grad_weight + + // All warps load their smem segments with incoming grad data + if(src_row < n && f < s && dst_row != padding_idx - TH_INDEX_BASE) + my_s[threadIdx.x] = ScalarConvert::to(scale*grad[src_row*stride + f]); + + __syncthreads(); + + // To ensure determinism, we can't just have each warp add its grad data to its dst_row. + // We need to check if any other warps pulled grad data targeting dst_row. + // If so, we elect the first warp in each matching group as the leader. + // Each leader warp serializes the accumulates targeting dst_row in shared memory, + // then finishes by adding the accumulated buffer to dst_row in grad_weight. 
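+      // Illustrative walk-through (hypothetical indices): suppose warps
+      // y = 1, 3 and 5 of this chunk all target dst_row 7. The ballot below
+      // then sets bits 1, 3 and 5 of matchmask; warp 1 (the first set bit)
+      // becomes the leader, serially adds warps 3 and 5's shared-memory
+      // segments into its own, and performs the single update of grad_weight,
+      // so the accumulation order is fixed and the result is deterministic.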
+ if(dst_row != padding_idx - TH_INDEX_BASE && src_row < n) // Per-warp exit condition + { + int match_found_this_thread = + (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]); + if(threadIdx.x >= n_this_chunk) + match_found_this_thread = 0; + unsigned int matchmask = WARP_BALLOT(match_found_this_thread); + + int first_remaining_peer = __ffs(matchmask) - 1; + + if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader + { + matchmask ^= (1 << first_remaining_peer); + while(matchmask) + { + first_remaining_peer = __ffs(matchmask) - 1; + my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer]; + matchmask ^= (1 << first_remaining_peer); + } + if(f < s) + grad_weight[dst_row*stride + f] += ScalarConvert::to(my_s[threadIdx.x]); + } + } + } + } +} + +template +__global__ void cunn_LookupTable_accGradParametersKernel( + int64_t *input, int64_t *indices, Dtype *gradOutput, Dtype *gradWeight, + int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride, int paddingValue) { + + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. + // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1]) + && input[idx] != paddingValue) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride; + const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride; + const Acctype scale = count ? 
ScalarConvert::to(defaultScale) / count[idx] : ScalarConvert::to(defaultScale); + + Acctype gradient[SZ]; + Acctype weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradient[ii] = ScalarConvert::to(gradOutput[gradOutputRow + featureDim]); + weight[ii] = ScalarConvert::to(gradWeight[weightRow + featureDim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradWeight[weightRow + featureDim] = ScalarConvert::to(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType norm) { + AccType xA = ScalarConvert::to(x); + return std::pow(std::abs(xA), norm); + } +}; + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType _) { + AccType xA = ScalarConvert::to(x); + return std::abs(xA); + } +}; + +template +struct FastPow +{ + __host__ __device__ + static inline AccType pow(DType x, AccType _) { + AccType xA = ScalarConvert::to(x); + return xA * xA; + } +}; + +/* Calculate norms of the rows of weight_ptr given by idx_ptr and capture them in norms */ +template +__global__ +void calculate_norms_and_renorm(DType *weights, + THCIndex_t *indices, + AccType normType, + AccType maxNorm, + IndexType dim) +{ + // Some casting hacks since dynamic shared memory and templates don't work together: + extern __shared__ unsigned char smem[]; + AccType *sdata = reinterpret_cast(smem); + + IndexType tid = threadIdx.x; + IndexType baseIndex = (indices[blockIdx.x] - TH_INDEX_BASE) * dim; + + AccType accZero = ScalarConvert::to(0); + AccType v = accZero; + for (IndexType i = tid; i < dim; i += blockDim.x) { + v += FastPow::pow(weights[baseIndex + i], normType); + } + + v = reduceBlock> + (sdata, blockDim.x, v, ReduceAdd(), accZero); + + if (tid == 0) { + sdata[0] = std::pow(v, + THCNumerics::div(ScalarConvert::to(1), normType) + ); + } + __syncthreads(); + // now we renormalize the blocks that need it + if (sdata[0] > maxNorm) { + DType factor = ScalarConvert::to(maxNorm / (sdata[0] + 1e-7)); + for (IndexType i = tid; i < dim; i += blockDim.x) { + weights[baseIndex + i] *= factor; + } + } + +} + +#include "generic/LookupTable.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu new file mode 100644 index 0000000..c2ba9f5 --- /dev/null +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -0,0 +1,143 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "THCThrustAllocator.cuh" +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif +#include +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensorSort.cuh" + +const int WARP_SIZE = 32; +const int MODE_SUM = 0; +const int MODE_MEAN = 1; + +template +__global__ void cunn_LookupTableBag_updateOutputKernel( + int64_t *input, int64_t *offsets, Dtype *weight, Dtype *output, + int64_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t stride, int mode, + int64_t *bag_size) { + + // the strategy here is that each bag x feature is handled by a single thread + + int64_t chunksPerBag = THCCeilDiv(stride, (int64_t) blockDim.x); + int64_t numChunks = numBags * chunksPerBag; 
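+  // Illustrative example (hypothetical sizes): with an embedding width of
+  // stride = 100 and blockDim.x = 32, chunksPerBag = ceil(100 / 32) = 4, so
+  // each bag is covered by 4 chunks of up to 32 features; chunk c maps to
+  // bag = c / chunksPerBag and featureDim = (c % chunksPerBag) * 32 + threadIdx.x.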
+ int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y; + int64_t chunkStride = gridDim.x * blockDim.y; + + for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) { + int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x; + if (featureDim < stride) { + int64_t bag = chunk / chunksPerBag; + Dtype* weightFeat = weight + featureDim; + int64_t begin = offsets[bag] - TH_INDEX_BASE; + int64_t end = (bag < numBags - 1) ? (offsets[bag + 1] - TH_INDEX_BASE) : numIndices; + assert(end >= begin); + Acctype weightFeatSum = ScalarConvert::to(0); + int64_t bag_size_ = 0; + for (int64_t emb = begin; emb < end; emb++) { + const int weightRow = ((int) input[emb] - TH_INDEX_BASE) * stride; + weightFeatSum += ScalarConvert::to(weightFeat[weightRow]); + bag_size_ ++; + if (featureDim == 0) { + offset2bag[emb] = bag + TH_INDEX_BASE; + } + } + if (mode == MODE_MEAN) { + weightFeatSum = weightFeatSum / ScalarConvert::to(bag_size_); + bag_size[bag] = bag_size_; + } + (void) MODE_SUM; //silence warnings about unused MODE_SUM; + output[bag * stride + featureDim] = ScalarConvert::to(weightFeatSum); + } + } +} + +// FIXME: removed the accGradParametersKernelByFeature case present in +// LookupTable. That kernel is faster at small sizes (<768 indices), which +// does not need LookupTableBag (LookupTable + Sum works fine), but would +// still be nice to not be slow in that case. + +template +__global__ void cunn_LookupTableBag_accGradParametersKernel( + int64_t *input, int64_t *indices, Dtype *gradOutput, Dtype *gradWeight, int64_t *offset2bag, + int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride, + int mode, int64_t *bag_size) { + + int idx = blockIdx.x * 4 + threadIdx.y; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. + // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values proceessed by each thread (grain size) + const int SZ = 4; + + if (idx < numel + && (idx == 0 || input[idx] != input[idx - 1])) { + do { + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride; + + // Note: only this line changes from LookupTable_accgradParametersKernel + const int origRow = ((int) indices[idx] - TH_INDEX_BASE); + const int seq_number = offset2bag[origRow] - TH_INDEX_BASE; + const int gradOutputRow = ((int) seq_number) * stride; + + const Acctype scale = count ? 
ScalarConvert::to(defaultScale) / count[idx] : ScalarConvert::to(defaultScale); + + Acctype gradient[SZ]; + Acctype weight[SZ]; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradient[ii] = ScalarConvert::to(gradOutput[gradOutputRow + featureDim]); + if (mode == MODE_MEAN) { + gradient[ii] /= bag_size[seq_number]; + } + weight[ii] = ScalarConvert::to(gradWeight[weightRow + featureDim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + weight[ii] += gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * WARP_SIZE; + if (featureDim < stride) + { + gradWeight[weightRow + featureDim] = ScalarConvert::to(weight[ii]); + } + } + + idx++; + } while (idx < numel && input[idx] == input[idx - 1]); + } +} + + +#include "generic/LookupTableBag.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu new file mode 100644 index 0000000..e9571fe --- /dev/null +++ b/aten/src/THCUNN/MSECriterion.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct mse_functor +{ + mse_functor() {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = ScalarConvert::to(x)-y; + return z*z; + } +}; + + +template +struct mse_updateOutput_functor +{ + mse_updateOutput_functor() {} + + __device__ void operator()( + const Dtype *input, + const Dtype *target, + Dtype *output) + { + Dtype diff = THCNumerics::sub(*input, *target); + *output = THCNumerics::mul(diff, diff); + } +}; + + +template +struct mse_updateGradInput_functor +{ + const Acctype norm; + + mse_updateGradInput_functor(Acctype norm_) + : norm(norm_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + return ScalarConvert::to(norm * (ScalarConvert::to(x) - y)); + } +}; + +#include "generic/MSECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu new file mode 100644 index 0000000..7ccdbb7 --- /dev/null +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -0,0 +1,45 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include +#include +#include +#include +#include + +template +struct margin_functor +{ + margin_functor(Acctype margin) + : margin(margin) + {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = margin - ScalarConvert::to(x) * y; + return z >= 0 ? z : 0; + } + + const Acctype margin; +}; + +template +struct margin_updateGradInput_functor +{ + const Acctype margin, norm; + + margin_updateGradInput_functor(Acctype margin_, Acctype norm_) + : margin(margin_) + , norm(norm_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + return ScalarConvert::to((ScalarConvert::to(x) * y) < margin ? 
-norm * y : 0); + } +}; + +#include "generic/MarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu new file mode 100644 index 0000000..13b432c --- /dev/null +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -0,0 +1,152 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include + +#define MULTILABELMARGIN_THREADS 1024 + +template +__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output, + Dtype *input, + THCIndex_t *target, + Dtype *istarget, + int nframe, + int dim, + int sizeaverage) +{ + // Temporary sums (for mapreduce) + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; + + // vectors: + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *output_k = output + k; + Dtype *istarget_k = istarget + k*dim; + + // zero istarget + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + istarget_k[d] = ScalarConvert::to(0); + } + __syncthreads(); + + // mark targets in istarget + if (threadIdx.x == 0) { + for (int dt = 0; dt < dim; dt++) { + int target_idx = target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + istarget_k[target_idx] = ScalarConvert::to(1); + } + } + __syncthreads(); + + // iterate over targets + Acctype sum = 0; + for (int dt = 0; dt < dim; dt++) { + // next target: + int target_idx = target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + + // current value for target + Dtype input_target_k = input_k[target_idx]; + + // compare to all inputs (multithreaded): + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + // contribute to loss only if not a target + if (!ScalarConvert::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; + if (z > 0) + sum += z; + } + } + } + + // reduce + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus(), (Acctype)0); + if (threadIdx.x == 0) { + if (sizeaverage) { + *output_k = ScalarConvert::to((totalSum / dim) / nframe); + } else { + *output_k = ScalarConvert::to(totalSum / dim); + } + } +} + +template +__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput, + Dtype *gradOutput, + Dtype *input, + THCIndex_t *target, + Dtype *istarget, + int nframe, + int dim, + int sizeaverage, + int reduce) +{ + // Temporary sums (for mapreduce) + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; + + // vectors: + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *istarget_k = istarget + k*dim; + + Dtype *gradOutput_k = gradOutput; + if (!reduce) { + gradOutput_k += k; + } + + // gain: + Dtype g = ScalarConvert::to( sizeaverage && reduce ? 
1./((Acctype)(nframe*dim)) : 1./((Acctype)dim) ); + + // zero gradients: + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + gradInput_k[d] = ScalarConvert::to(0); + } + __syncthreads(); + + // iterate over targets + for (int dt = 0; dt < dim; dt++) { + // next target: + int target_idx = (int)target_k[dt] - TH_INDEX_BASE; + if (target_idx < 0) break; + + // current value for target + Dtype input_target_k = input_k[target_idx]; + + // compare to all inputs (multithreaded): + Acctype sum = 0; + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + // contribute to loss only if not a target + if (!ScalarConvert::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; + if (z > 0) { + sum -= g; + gradInput_k[d] += g; + } + } + } + __syncthreads(); + + // reduce sum + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus(), (Acctype)0); + if (threadIdx.x == 0) { + gradInput_k[target_idx] += ScalarConvert::to(totalSum); + } + } + + for (int d = threadIdx.x; d < dim; d += blockDim.x) { + gradInput_k[d] *= *gradOutput_k; + } +} + +#include "generic/MultiLabelMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" + +#undef MULTILABELMARGIN_THREADS diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu new file mode 100644 index 0000000..c2fa213 --- /dev/null +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -0,0 +1,122 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define MULTIMARGIN_THREADS 128 + +template +__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) +{ + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *output_k = output + k; + int target_k = ((int)target[k]) - TH_INDEX_BASE; + Dtype input_target_k = input_k[target_k]; + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i = i_start; i < i_end; i += i_step) + { + Dtype z = margin - input_target_k + input_k[i]; + if (i == target_k) + continue; + + if (z > 0) { + Dtype h = (P==1) ? z : z*z; + if(weights) + h *= weights[target_k]; + buffer[threadIdx.x] += h; + } + } + __syncthreads(); + + // reduce + if (threadIdx.x == 0) + { + Acctype sum = 0; + for (int i=0; i < blockDim.x; i++) + sum += buffer[i]; + + *output_k = ScalarConvert::to(sum/dim); + if(sizeAverage) + *output_k /= nframe; + } +} + +template +__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, + Dtype *gradOutput, + Dtype *input, + THCIndex_t *target, + Dtype *weights, + int nframe, + int dim, + bool sizeAverage, + Dtype margin, + int reduce) +{ + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; + int target_k = ((int)target[k]) - TH_INDEX_BASE; + Dtype input_target_k = input_k[target_k]; + + Dtype *gradOutput_k = gradOutput; + if (!reduce) { + gradOutput_k += k; + } + + Acctype g = (sizeAverage && reduce ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim)); + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i=i_start; i 0) + { + Dtype h = ScalarConvert::to((P == 1) ? 
g : 2*g*z); + if(weights) + h *= weights[target_k]; + buffer[threadIdx.x] -= h; + gradInput_k[i] = h; + } + else + gradInput_k[i] = ScalarConvert::to(0); + } + + __syncthreads(); + + // reduce + if (threadIdx.x == 0) + { + Acctype gradInput_target_k = 0; + for (int i=0; i::to(gradInput_target_k); + } + + for (int i=i_start; i +#include "THCTensor.hpp" + +#include "common.h" + +template +struct PReLUUpdateOutput +{ + T* weight_; + + PReLUUpdateOutput(T* weight) + : weight_(weight) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > 0) ? x : weight_[0] * x; + } +}; + +template +__global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize) +{ + CUDA_KERNEL_LOOP(i, n) + { + int positionInSample = i % nElemsPerSample; + int mapNumber = positionInSample / mapSize; + output[i] = input[i] > 0 ? input[i] : input[i] * weight[mapNumber]; + } +} + +template +struct PReLUUpdateGradInput +{ + T *weight_; + + PReLUUpdateGradInput(T *weight) + : weight_(weight) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input) + { + *gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_; + } +}; + +template +__global__ void preluBackward( + T *gradInput, + const T *input, + const T *weight, + const T *gradOutput, + int n, int nElemsPerSample, int mapSize) +{ + CUDA_KERNEL_LOOP(i, n) + { + int positionInSample = i % nElemsPerSample; + int mapNumber = positionInSample / mapSize; + gradInput[i] = input[i] > 0 ? gradOutput[i] : gradOutput[i] * weight[mapNumber]; + } +} + +template +struct PReLUAccGradParametersShared +{ + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) + { + *gradInput = (*input) * (*gradOutput) * (*input <= 0); + } +}; + +template +struct PReLUAccGradParameters +{ + T scale; + + PReLUAccGradParameters(T scale) + : scale(scale) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) + { + *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + } +}; + +template +struct PReLUAccGradParameters1to1 +{ + T scale; + + PReLUAccGradParameters1to1(T scale) + : scale(scale) + {} + + __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) + { + *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + } +}; + +#include "generic/PReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu new file mode 100644 index 0000000..bf45035 --- /dev/null +++ b/aten/src/THCUNN/RReLU.cu @@ -0,0 +1,124 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include +#include "common.h" +#include +#include + +// copied from cutorch/lib/THC/THCTensorRandom.cu +#define MAX_NUM_BLOCKS 64 +#define BLOCK_SIZE 256 +#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS) + +template +inline T __device__ curand_uniform_type(curandStateMtgp32 *state); + +#ifdef CUDA_HALF_TENSOR +template <> +inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); +} +#endif + +template <> +inline float __device__ curand_uniform_type(curandStateMtgp32 *state) { + return curand_uniform(state); +} + +template <> +inline double __device__ curand_uniform_type(curandStateMtgp32 *state) { + return curand_uniform_double(state); +} + +template +__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state, + T *input, T* noise, T 
*output, double a, double b) +{ + CUDA_KERNEL_LOOP(i, n) + { + if (input[i] <= 0) + { + T r = curand_uniform_type(&state[blockIdx.x]); + r = ScalarConvert::to(r * (b-a) + a); + output[i] = input[i] * r; + noise[i] = r; + } + else + { + output[i] = input[i]; + noise[i] = ScalarConvert::to(1); + } + } +} + +template +struct RReLUUpdateOutputEval_functor +{ + const T negSlope_; + + RReLUUpdateOutputEval_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + const T x = *in; + const T r = x <= 0 ? negSlope_ : ScalarConvert::to(1); + *out = x * r; + } +}; + +template +struct RReLUUpdateOutputEvalIP_functor +{ + const T negSlope_; + + RReLUUpdateOutputEvalIP_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *x) + { + if (*x <= 0) + { + *x = *x * negSlope_; + } + } +}; + +template +struct RReLUupdateGradInputEval_functor +{ + const T negSlope_; + + RReLUupdateGradInputEval_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in) + { + *gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut); + } +}; + +template +struct RReLUupdateGradInputEvalIP_functor +{ + const T negSlope_; + + RReLUupdateGradInputEvalIP_functor(T negSlope) + : negSlope_(negSlope) + {} + + __device__ __forceinline__ void operator()(T *gradOut, T *in) + { + if (*in <= 0) + { + *gradOut = (*gradOut) * negSlope_; + } + } +}; + +#include "generic/RReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SharedMem.cuh b/aten/src/THCUNN/SharedMem.cuh new file mode 100644 index 0000000..070d269 --- /dev/null +++ b/aten/src/THCUNN/SharedMem.cuh @@ -0,0 +1,45 @@ +// Based on the simpleTempltes CUDA example + +#ifndef THCUNN_SHAREDMEM_H +#define THCUNN_SHAREDMEM_H + +template +struct SharedMem { + __device__ T *getPointer() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct SharedMem +{ + __device__ half *getPointer() { + extern __shared__ half s_half[]; + return s_half; + } +}; +#endif + +template <> +struct SharedMem +{ + __device__ float *getPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMem +{ + __device__ double *getPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +#endif diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu new file mode 100644 index 0000000..85bda93 --- /dev/null +++ b/aten/src/THCUNN/Sigmoid.cu @@ -0,0 +1,30 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct sigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(T* gradInput, const T *output, const T *gradOutput) const { + *gradInput = *gradOutput * (1.f - *output) * (*output); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct sigmoid_updateGradInput_functor { + __device__ __forceinline__ void operator()(half* gradInput, const half *output, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + *gradInput = __hmul(*gradOutput, __hmul(__hadd(one, __hneg(*output)), *output)); +#else + const float out = __half2float(*output); + const float go = __half2float(*gradOutput); + *gradInput = __float2half(go * (1.f - out) * out); +#endif + } +}; +#endif + +#include "generic/Sigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git 
a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu new file mode 100644 index 0000000..c8018d9 --- /dev/null +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -0,0 +1,91 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include +#if CUDA_VERSION >= 7000 +#include +#endif + +template +struct smoothl1_functor +{ + smoothl1_functor() {} + + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const + { + Acctype z = ScalarConvert::to(THCNumerics::abs(x-y)); + return z < Acctype(1) ? 0.5f*z*z : z - 0.5f; + } +}; + +template +struct smoothl1_updateOutput_no_reduce_functor +{ + smoothl1_updateOutput_no_reduce_functor() {} + + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *out) const + { + Dtype oneHalf = ScalarConvert::to(0.5f); + Dtype z = THCNumerics::abs(*x - *y); + *out = z < ScalarConvert::to(1) ? oneHalf * z * z : z - oneHalf; + } +}; + +template +struct smoothl1_updateGradInput_no_reduce_functor +{ + smoothl1_updateGradInput_no_reduce_functor() {} + + __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *gradInput) const + { + Dtype z = *x - *y; + Dtype one = ScalarConvert::to(1); + Dtype minusOne = ScalarConvert::to(-1); + if (z < minusOne) { + *gradInput = minusOne; + } else if (z > one) { + *gradInput = one; + } else { + *gradInput = z; + } + } +}; + +template +struct smoothl1_updateGradInput_functor +{ + const Dtype norm; + const Dtype gradOutput; + + smoothl1_updateGradInput_functor(Dtype norm_, Dtype gradOutput_) + : norm(norm_), gradOutput(gradOutput_) + {} + + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const + { + Dtype z = x - y; + if (z < ScalarConvert::to(-1)) + return -norm * gradOutput; + else if (z > ScalarConvert::to(1)) + return norm * gradOutput; + else + return norm * z * gradOutput; + } +}; + +#include "generic/SmoothL1Criterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu new file mode 100644 index 0000000..ee53e76 --- /dev/null +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -0,0 +1,65 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" + +#include +#include +#include +#include +#include + +template +struct softmargin_functor +{ + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const + { + return log(1 + exp(ScalarConvert::to(-x)*y)); + } +}; + +template +struct softmargin_no_reduce_functor +{ + __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + Dtype *out) const + { + *out = ScalarConvert::to(log(ScalarConvert::to(1) + + exp(ScalarConvert::to(-*x) * *y))); + } +}; + +template +struct softmargin_updateGradInput_functor +{ + const Acctype norm; + const Dtype gradOutput; + + softmargin_updateGradInput_functor(Acctype norm_, Dtype gradOutput_) : + norm(norm_), gradOutput(gradOutput_) {} + + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const + { + Acctype temp = exp(ScalarConvert::to(-x)*y); + return ScalarConvert::to(-y*temp*norm/(ScalarConvert::to(1) + temp) * gradOutput); + } +}; + +template +struct softmargin_updateGradInput_no_reduce_functor +{ + __forceinline__ __host__ __device__ void operator()( + const Dtype *x, + const Dtype *y, + 
Dtype *gradInput) const + { + Acctype temp = exp(ScalarConvert::to(-*x) * *y); + *gradInput = ScalarConvert::to(-*y * temp / (ScalarConvert::to(1) + temp)); + } +}; + +#include "generic/SoftMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu new file mode 100644 index 0000000..42b2c3c --- /dev/null +++ b/aten/src/THCUNN/SoftPlus.cu @@ -0,0 +1,43 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct softPlusupdateOutput_functor +{ + const T threshold; + const T beta; + + softPlusupdateOutput_functor(T threshold_, T beta_) + : threshold(threshold_) + , beta(beta_) + {} + + __device__ void operator()(T *output, const T *input) const { + T betain = beta * (*input); + *output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain)); + } +}; + +template +struct softPlusupdateGradInput_functor +{ + const T threshold; + const T beta; + + softPlusupdateGradInput_functor(T threshold_, T beta_) + : threshold(threshold_) + , beta(beta_) + {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + T betaout = beta * (*output); + T exp_bo = exp(betaout); + *gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo; + } +}; + +#include "generic/SoftPlus.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu new file mode 100644 index 0000000..a4e45d8 --- /dev/null +++ b/aten/src/THCUNN/SoftShrink.cu @@ -0,0 +1,44 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct SoftShrinkUpdateOutput +{ + const T lambda_; + + SoftShrinkUpdateOutput(T lambda) + : lambda_(lambda) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + if (x > lambda_) *out = x - lambda_; + else if (x < -lambda_) *out = x + lambda_; + else *out = ScalarConvert::to(0); + } +}; + +template +struct SoftShrinkUpdateGradInput +{ + const T lambda_; + + SoftShrinkUpdateGradInput(T lambda) + : lambda_(lambda) + {} + + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const + { + T x = *input; + if (x > lambda_ || x < -lambda_) + *gradInput = *gradOutput; + else + *gradInput = ScalarConvert::to(0); + } +}; + +#include "generic/SoftShrink.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu new file mode 100644 index 0000000..cd9b659 --- /dev/null +++ b/aten/src/THCUNN/SparseLinear.cu @@ -0,0 +1,87 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include + +static cusparseHandle_t cusparse_handle = 0; + +static void init_cusparse() { + if (cusparse_handle == 0) { + cusparseStatus_t status = cusparseCreate(&cusparse_handle); + if (status != CUSPARSE_STATUS_SUCCESS) { + THError("CUSPARSE Library initialization failed"); + } + } +} + +#ifdef CUDA_HALF_TENSOR +void THNN_CudaHalfSparseLinear_updateOutput( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_accGradParameters( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, 
+ float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_legacyUpdateOutput( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_legacyAccGradParameters( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + float weightDecay, + float scale) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_zeroGradParameters( + THCState *state, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} + +void THNN_CudaHalfSparseLinear_updateParameters( + THCState *state, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput, + float learningRate) { + THError("THCudaHalfTensor not supported with SparseLinear"); +} +#endif + +#include "generic/SparseLinear.cu" +#include "THCGenerateFloatType.h" +#include "generic/SparseLinear.cu" +#include "THCGenerateDoubleType.h" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu new file mode 100644 index 0000000..2c671da --- /dev/null +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -0,0 +1,197 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +// 4d tensor B x D x H x W +// All kernels view batch dim B and feature dim D as collapsed. + +/* + * Description: + * this function adaptively average pools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output + */ + template +__global__ void adaptiveaveragepool(T *input, T *output, + int isizeH, int isizeW, + int osizeH, int osizeW, + int64_t istrideD, int64_t istrideH, int64_t istrideW) +{ + // iterators on output pixels + int oh, ow; + + // select input/output plane based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + output = output + o_plane*osizeH*osizeW; + input = input + i_plane*istrideD; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + const int ostepH = blockDim.y*gridDim.y; + + int ostartW = threadIdx.x; + int oendW = osizeW; + const int ostepW = blockDim.x; + + // For all output pixels... 
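+  // Illustrative note (editorial): START_IND/END_IND carve the input into adaptive bins.
+  // For example, with isizeH = 5 and osizeH = 3, output row 0 averages input rows [0, 2),
+  // row 1 averages rows [1, 4), and row 2 averages rows [3, 5); bins may overlap and
+  // their sizes kH, kW differ by at most one.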
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the average pooling over corresponding input pixels + T *ptr_input = input + istartH*istrideH + istartW*istrideW; + T *ptr_output = output + oh*osizeW + ow; + T sum = ScalarConvert::to(0); + int ih, iw; + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + T val = ptr_input[iw*istrideW]; + sum += val; + } + ptr_input += istrideH; // next input line + } + // Update output + *ptr_output = sum / kH / kW; + } + } +} + +/* + * Description: + * this function computes the gradInput from gradOutput + */ + template +__global__ void adaptiveaveragegradinput( + T *gradInput, T *gradOutput, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators on input pixels + int ih, iw; + + // select input/output plane based on thread/block ID + int i_plane = blockIdx.x; + int o_plane = i_plane; + + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + + int istartH = blockDim.y*blockIdx.y + threadIdx.y; + int iendH = isizeH; + int istepH = blockDim.y*gridDim.y; + + int istartW = threadIdx.x; + int iendW = isizeW; + int istepW = blockDim.x; + + // compute gradInput + for(ih = istartH; ih < iendH; ih += istepH) { + + int ostartH = START_IND(ih, isizeH, osizeH); + int oendH = END_IND(ih, isizeH, osizeH); + + for(iw = istartW; iw < iendW; iw += istepW) { + + int ostartW = START_IND(iw, isizeW, osizeW); + int oendW = END_IND(iw, isizeW, osizeW); + + // Compute the gradients over corresponding output pixels + T *ptr_gradInput = gradInput + ih*isizeW + iw; + + int oh, ow; + for(oh = ostartH; oh < oendH; ++oh) { + int kH = START_IND(oh, osizeH, isizeH) - END_IND(oh, osizeH, isizeH); + for(ow = ostartW; ow < oendW; ++ow) { + int kW = START_IND(ow, osizeW, isizeW) - END_IND(ow, osizeW, isizeW); + T grad_delta = gradOutput[ow + oh*osizeW] / kH / kW; + *ptr_gradInput += grad_delta; + } + } + } + } +} + +/* + * Description: + * this function computes the gradInput from gradOutput + * (uses atomic add) + */ + template +__global__ void atomicadaptiveaveragegradinput( + T *gradInput, T *gradOutput, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators on output indices + int oh, ow; + + // select input/output plane based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + gradOutput = gradOutput + o_plane*osizeW*osizeH; + gradInput = gradInput + i_plane*isizeW*isizeH; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // For all output pixels... 
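+  // Illustrative note (editorial): unlike adaptiveaveragegradinput above, which walks the
+  // *input* pixels so each gradInput element has a single writer, this kernel walks the
+  // *output* pixels. Because adjacent adaptive bins can overlap (e.g. bins [0, 2) and
+  // [1, 4) both touch input row 1), several threads may scatter into the same gradInput
+  // element, hence the atomicAdd further down.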
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients for over corresponding input pixels + T *ptr_gradInput = gradInput + istartH*isizeW + istartW; + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput / kW / kH; + + int ih, iw; + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + // atomic add since different threads could update same variable + atomicAdd(&(ptr_gradInput[iw]), grad_delta); + } + ptr_gradInput += isizeW; // next input line + } + } + } +} + +#include "generic/SpatialAdaptiveAveragePooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS +#undef START_IND +#undef END_IND diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu new file mode 100644 index 0000000..592e6fd --- /dev/null +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -0,0 +1,180 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +/* + * Description: + * this function adaptively maxpools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output, 4D argmax x and y + */ + template +__global__ void adaptivemaxpool(T *input, T *output, THCIndex_t *indices, + int isizeH, int isizeW, + int osizeH, int osizeW, + int64_t istrideD, int64_t istrideH, int64_t istrideW) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + int ostartW = threadIdx.x; + int oendW = osizeW; + const int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + const int ostepH = blockDim.y*gridDim.y; + // select input/output plane + output = output + o_plane*osizeH*osizeW; + input = input + i_plane*istrideD; + indices = indices + o_plane*osizeH*osizeW; + + // For all output pixels... + for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + + int kW = iendW - istartW; + + // Compute the mean of the input image... 
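+      // Editorial note: despite the comment above (apparently carried over from the
+      // average-pooling kernel), this block computes the max and its flattened argmax
+      // over the adaptive window; the `(val > max) || isnan(val)` test also lets NaNs
+      // propagate to the output instead of being silently skipped.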
+ T *ptr_input = input + istartH*istrideH + istartW*istrideW; + T *ptr_output = output + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + int argmax = -1; + T max = THCNumerics::min(); + int ih, iw; + for(ih = 0; ih < kH; ih++) { + for(iw = 0; iw < kW; iw++) { + T val = ptr_input[iw*istrideW]; + if ((val > max) || THCNumerics::isnan(val)) { + max = val; + argmax = (ih+istartH)*isizeW + iw+istartW; + } + } + ptr_input += istrideH; // next input line + } + // Update output and argmax + *ptr_output = max; + *ptr_ind = argmax + TH_INDEX_BASE; + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void adaptivemaxgradinput(T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeH, int isizeW, + int osizeH, int osizeW) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + //int k = blockIdx.x % sizeD; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + indices = indices + o_plane*osizeH*osizeW; + + // compute gradInput + for(oh = ostartH; oh < oendH; oh += ostepH) { + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + T z = *ptr_gradOutput; + + int argmax = (*ptr_ind) - TH_INDEX_BASE; + + gradInput[argmax] += z; + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + * when kH != dH or kW != dW (uses atomic add) + */ + template +__global__ void atomicadaptivemaxgradinput( + T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeH, int isizeW, int osizeH, int osizeW +) +{ + // iterators + int oh, ow; + + // compute offsets based on thread/block ID + int o_plane = blockIdx.x; + int i_plane = o_plane; + + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + int ostartH = blockDim.y*blockIdx.y + threadIdx.y; + int oendH = osizeH; + int ostepH = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o_plane*osizeH*osizeW; + gradInput = gradInput + i_plane*isizeH*isizeW; + indices = indices + o_plane*osizeH*osizeW; + + // compute gradInput + for(oh = ostartH; oh < oendH; oh += ostepH) { + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + T *ptr_gradOutput = gradOutput + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices + oh*osizeW + ow; + T z = *ptr_gradOutput; + + int argmax = (*ptr_ind) - TH_INDEX_BASE; + + // atomic add since different threads could update same variable + atomicAdd(&(gradInput[argmax]), z); + } + } +} + +#include "generic/SpatialAdaptiveMaxPooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu new file mode 100644 index 0000000..ce9941a --- /dev/null +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -0,0 +1,86 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "common.h" + +template +__global__ void AvePoolForward(const int nthreads, + const Dtype* const bottom_data, const int num, const int channels, + const int height, const int width, const 
int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + Dtype* const top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + Acctype aveval = Acctype(0); + const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + if(COUNT_INCLUDE_PAD) + top_data[index] = ScalarConvert::to(aveval / pool_size); + else + top_data[index] = ScalarConvert::to(aveval / ((hend - hstart) * (wend - wstart))); + } +} + +template +__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + Dtype* const bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Acctype gradient = Acctype(0); + const Dtype* const top_diff_slice = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + if(COUNT_INCLUDE_PAD) + gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; + else + gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart)); + } + } + bottom_diff[index] = ScalarConvert::to(gradient); + } +} + +#include "generic/SpatialAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu new file mode 100644 index 0000000..83addd0 --- /dev/null +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -0,0 +1,161 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include + +#include + +template +__global__ void SpatialClassNLLCriterion_updateOutput_no_reduce_kernel( + int64_t nthreads, + THCDeviceTensor input, + THCDeviceTensor target, + THCDeviceTensor output, + Dtype *weights, + int64_t ignore_index) { + int64_t batch_size = input.getSize(0); + int64_t H = input.getSize(2); + int64_t W = input.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + const int64_t b = index % batch_size; + const int64_t h = (index / batch_size) % H; + const int64_t w = (index / (batch_size * H)) % W; + + int64_t cur_target = target[b][h][w] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + output[b][h][w] = ScalarConvert::to(0); + continue; + } + Dtype value = input[b][cur_target][h][w]; + Dtype weight = + weights ? weights[cur_target] : ScalarConvert::to(1); + output[b][h][w] = -value * weight; + } +} + +template +__global__ void SpatialClassNLLCriterion_updateGradInput_no_reduce_kernel( + int64_t nthreads, + THCDeviceTensor target, + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + Dtype *weights, + int64_t ignore_index) { + int64_t batch_size = target.getSize(0); + int64_t H = target.getSize(1); + int64_t W = target.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + const int64_t b = index % batch_size; + const int64_t h = (index / batch_size) % H; + const int64_t w = (index / (batch_size * H)) % W; + + int64_t cur_target = target[b][h][w] - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + Dtype value = + -(weights ? 
weights[cur_target] : ScalarConvert::to(1)); + gradInput[b][cur_target][h][w] = value * gradOutput[b][h][w]; + } +} + +template +__global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel( + T *output, + T *total_weight, + T *input, + THCIndex_t *target, + T *weights, + int size_average, + int batch_size, + int n_classes, + int map_nelem, + int blocks_per_sample, + int64_t ignore_index) +{ + __shared__ AccumT partial_sums[CUDA_NUM_THREADS]; + + int i, t; + T cur_weight; + AccumT input_sum = 0; + AccumT acc_weight = 0; + + int sample = blockIdx.x / blocks_per_sample; + int toffset = sample * map_nelem; + int ioffset = sample * map_nelem * n_classes; + int step = blockDim.x * blocks_per_sample; + for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; + i < map_nelem; + i += step) { + t = target[toffset + i] - TH_INDEX_BASE; + if (t != ignore_index) { + assert(t >= 0 && t < n_classes); + cur_weight = weights ? weights[t] : ScalarConvert::to(1); + input_sum -= input[ioffset + i + map_nelem * t] * cur_weight; + acc_weight += cur_weight; + } + } + + input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus(), AccumT(0)); + __syncthreads(); + acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus(), AccumT(0)); + + if (threadIdx.x == 0) { + atomicAdd(total_weight, ScalarConvert::to(acc_weight)); + atomicAdd(output, ScalarConvert::to(input_sum)); + } +} + +template +__global__ void cunn_SpatialClassNLLCriterion_sizeAverage_kernel( + T *output, + T *total_weight) +{ + if (*total_weight > 0) + *output = THCNumerics::div(*output, *total_weight); +} + +template +__global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( + T *gradInput, + T *gradOutput, + THCIndex_t *target, + T *weights, + T *total_weight, + int size_average, + int batch_size, + int n_classes, + int map_nelem, + int blocks_per_sample, + int64_t ignore_index) +{ + if (*total_weight <= 0) + return; + + int i, t; + T norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); + + int sample = blockIdx.x / blocks_per_sample; + int step = blockDim.x * blocks_per_sample; + int toffset = sample * map_nelem; + int ioffset = sample * map_nelem * n_classes; + for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; + i < map_nelem; + i += step) { + t = (int)target[toffset + i] - TH_INDEX_BASE; + if (t != ignore_index) { + assert(t >= 0 && t < n_classes); + gradInput[ioffset + i + map_nelem * t] = -(weights ? 
weights[t] : ScalarConvert::to(1)) * norm * gradOutput[0]; + } + } +} + +#include "generic/SpatialClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu new file mode 100644 index 0000000..17801d5 --- /dev/null +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/SpatialConvolutionLocal.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu new file mode 100644 index 0000000..4a59acb --- /dev/null +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -0,0 +1,10 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialConvolutionMM.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu new file mode 100644 index 0000000..cd6f081 --- /dev/null +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -0,0 +1,126 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" +#include "common.h" + +template +__global__ void +#if __CUDA_ARCH__ >= 320 +__launch_bounds__(CUDA_NUM_THREADS) +#endif +LRNFillScale(const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + const Dtype* const in_off = in + offset; + Dtype* const scale_off = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + Acctype accum_scale = Acctype(0); + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_off[(head - post_pad) * step] = ScalarConvert::to(k + accum_scale * alpha_over_size); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_off[(head - post_pad) * step] = ScalarConvert::to(k + accum_scale * alpha_over_size); + ++head; + } + } +} + +template +__global__ void LRNComputeOutput(const int nthreads, const Dtype* in, + const Dtype* scale, const Dtype negative_beta, Dtype* out) { + CUDA_KERNEL_LOOP(index, nthreads) { + out[index] = in[index] * pow(scale[index], negative_beta); + } +} + +template +__global__ void LRNComputeDiff(const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + 
const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + const Dtype* const bottom_off = bottom_data + offset; + const Dtype* const top_off = top_data + offset; + const Dtype* const scale_off = scale + offset; + const Dtype* const top_diff_off = top_diff + offset; + Dtype* const bottom_diff_off = bottom_diff + offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + Acctype accum_ratio = Acctype(0); + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff_off[head * step] * top_off[head * step] / + scale_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff_off[head * step] * top_off[head * step] / + scale_off[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff_off[(head - size) * step] * + top_off[(head - size) * step] / scale_off[(head - size) * step]; + } + bottom_diff_off[(head - post_pad) * step] = + ScalarConvert::to(top_diff_off[(head - post_pad) * step] + * pow(scale_off[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff_off[(head - size) * step] * + top_off[(head - size) * step] / scale_off[(head - size) * step]; + } + bottom_diff_off[(head - post_pad) * step] = + ScalarConvert::to(top_diff_off[(head - post_pad) * step] + * pow(scale_off[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); + ++head; + } + } +} + + +#include "generic/SpatialCrossMapLRN.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu new file mode 100644 index 0000000..a0231aa --- /dev/null +++ b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu @@ -0,0 +1,258 @@ +// updateOutput, updateGradInput Kernels ported from Sergey Zagoruyko's pyinn, which itself was a +// port from Caffe + +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCNumerics.cuh" +#include "THCReduceApplyUtils.cuh" +#include "THCSortUtils.cuh" +#include "THCTensorMathReduce.cuh" +#include "SharedMem.cuh" +#include "common.h" +#include + + +const int WARP_SIZE = 32; +// Crude benchmarks suggest 256 is better than 512 and 1024 +// TODO: Autotune/use better heuristics, improve speed more. 
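+// Editorial note: with WARP_SIZE = 32 above and the MAX_BLOCK_SIZE defined just below,
+// getGradParamsNumThreads() gives each batch item its own warp, capped at
+// MAX_BLOCK_SIZE / WARP_SIZE = 8 warps. For example, batchSize = 4 yields 128 threads
+// (4 warps), while batchSize = 32 yields 256 threads, so each of the 8 warps loops over
+// 4 batch items in the accGradParameters kernel further down.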
+const int MAX_BLOCK_SIZE = 256; + +static int getGradParamsNumThreads(int batchSize){ +//warp per item in a batch, up to a maximum + return std::min(batchSize * WARP_SIZE, MAX_BLOCK_SIZE); + +} + +template +__global__ void spatialDepthwiseConvolutionUpdateOutput( + const THCDeviceTensor input, + THCDeviceTensor output, + const THCDeviceTensor weight, + const THCDeviceTensor bias, + bool biasEnabled, + IndexType totalElements, + const int outputChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int KW_LIMIT = (kSize !=0) ? kSize : kernelWidth; + const int KH_LIMIT = (kSize !=0) ? kSize : kernelHeight; + + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + //calculate n,c,h,w indices, replacing modulos by divide and multiply add, + //result is same as would be in the code below + //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth + //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth + //const int h = (linearIndex / outputWidth) % outputHeight; + //const int w = linearIndex % outputWidth; + + int indtmp1 = linearIndex/outputWidth; + const int w = linearIndex - indtmp1 * outputWidth; + int indtmp2 = indtmp1/outputHeight; + const int h = indtmp1 - indtmp2 * outputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/outputChannels; + const int c = indtmp1 - indtmp2 * outputChannels; + const int n = indtmp2; + + int inputChannel = c; + int inputChannels = outputChannels; + if (depthwiseMultiplier !=1) { + inputChannel /= depthwiseMultiplier; + inputChannels /= depthwiseMultiplier; + } + + int weightOffset = c * kernelHeight * kernelWidth; + + AccT value = biasEnabled ? ScalarConvert::to(bias.data()[c]) : ScalarConvert::to(0); + const IndexType offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; +#pragma unroll + for (int kH = 0; kH < KH_LIMIT; ++kH) { +#pragma unroll + for (int kW = 0; kW < KW_LIMIT; ++kW) { + const int h_in = -padHeight + h * strideHeight + kH * dilationHeight; + const int w_in = -padWidth + w * strideWidth + kW * dilationWidth; + + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { + const IndexType offset = offset0 + h_in * inputWidth + w_in; + value = THCNumerics::add( + value, + THCNumerics::mul( + ScalarConvert::to(weight.data()[weightOffset]), + ScalarConvert::to(input.data()[offset]))); + } + ++weightOffset; + } + } + output.data()[linearIndex] = ScalarConvert::to(value); + } +} + +template +__global__ void spatialDepthwiseConvolutionUpdateGradInput( + const THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + const THCDeviceTensor weight, + IndexType totalElements, + const int inputChannels, + const int depthwiseMultiplier, + const int outputChannels, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int KW_LIMIT = (kSize !=0) ? 
kSize : kernelWidth; + const int KH_LIMIT = (kSize !=0) ? kSize : kernelHeight; + const int strideW = (stride !=0) ? stride : strideWidth; + const int strideH = (stride !=0) ? stride : strideHeight; + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + + int indtmp1 = linearIndex/inputWidth; + const int w = linearIndex - indtmp1 * inputWidth; + int indtmp2 = indtmp1/inputHeight; + const int h = indtmp1 - indtmp2 * inputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/inputChannels; + const int c = indtmp1 - indtmp2 * inputChannels; + const int n = indtmp2; + + AccT value = ScalarConvert::to(0); + +#pragma unroll + for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) { + int och = (c * depthwiseMultiplier) + multiplier; + int weightOffset = och * kernelHeight * kernelWidth; +#pragma unroll + for (int kh = 0; kh < KH_LIMIT; ++kh) { +#pragma unroll + for (int kw = 0; kw < KW_LIMIT; ++kw) { + int h_out = h + padHeight - kh * dilationHeight; + int w_out = w + padWidth - kw * dilationWidth; + if ((h_out % strideH == 0) && (w_out % strideW == 0)) { + h_out = h_out / strideH; + w_out = w_out / strideW; + + if ((h_out >= 0) && (h_out < outputHeight) + && (w_out >= 0) && (w_out < outputWidth)) { + + const int offset = ((n * outputChannels + och) * outputHeight + h_out) + * outputWidth + w_out; + value = THCNumerics::add( + value, + THCNumerics::mul( + ScalarConvert::to(weight.data()[weightOffset]), + ScalarConvert::to(gradOutput.data()[offset]))); + } + } + ++weightOffset; + } + } + } + gradInput.data()[linearIndex] = ScalarConvert::to(value); + } +} + + +template +__global__ void spatialDepthwiseConvolutionAccGradParameters( + const THCDeviceTensor gradOutput, + const THCDeviceTensor input, + THCDeviceTensor gradWeight, + const int batchSize, + const int inputChannels, + const int kernelChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) +{ + const int channelStride = kernelWidth * kernelHeight; + + // Have to use a statically typed Shared Memory pointer + SharedMem smem; + + // Each Block is responsible for accumulating over a permutation of + // (channels x kH x kW), use blockIdx to determine which one + int bidx = blockIdx.x; + int kW = bidx % kernelWidth; + int kH = (bidx / kernelWidth) % kernelHeight; + int ch = (bidx / channelStride); + + // Need to calculate which input channel is associated with this filter + // channel + int inputCh = ch / depthwiseMultiplier; + + AccT grad = ScalarConvert::to(0.0); + + const int laneId = threadIdx.x % WARP_SIZE; + const int batch = threadIdx.x / WARP_SIZE; + const int nwarps = blockDim.x / WARP_SIZE; + const int imageElements = outputWidth * outputHeight; + // Use warp per item. In the original kernel, a threadblock was used to sum over NHW. + // Here, we use a warp to sum values over HW dimension, and if batchSize is larger than the + // number of warps, a warp would loop over remaining batch items (e.g. if there are 8 warps, + // warp 0 would go over 0-8-16 etc image, warp 1 over 1-9-17 etc). Later in blockReduce, + // all the warps will be reduced anyway, thus the full reduction will be over NHW, like it + // should be. 
That allows to get rid of one modulo operation inside the loop (because n/batchIdx + // now does not have to be computed through modulo, you are just looping over it), and + // bring a nice speed-up. + for (int batchIdx = batch; batchIdx < batchSize; batchIdx += nwarps){ + // Warp-stride loop over elements in a batch item + for (IndexType idx = laneId; idx < imageElements; idx += WARP_SIZE) { + // Need to calculate the following: batch position, and offset into the gradOutput + // in height, and width. We can intuit the corresponding position in the input from + // the other parameters we have + int go_w_offset = idx % outputWidth; + int go_h_offset = (idx / outputWidth); + + int i_w_offset = (go_w_offset * strideWidth) + (kW * dilationWidth) - padWidth; + int i_h_offset = (go_h_offset * strideHeight) + (kH * dilationHeight) - padHeight; + + if (i_w_offset >= 0 && i_h_offset >= 0 && i_w_offset < inputWidth && i_h_offset < inputHeight) { + int inputOffset = ((batchIdx * inputChannels + inputCh) * inputHeight + i_h_offset) * inputWidth + i_w_offset; + int outputOffset = ((batchIdx * kernelChannels + ch) * outputHeight ) * outputWidth + idx; + grad = THCNumerics::add( + grad, + THCNumerics::mul( + ScalarConvert::to(input.data()[inputOffset]), + ScalarConvert::to(gradOutput.data()[outputOffset]))); + } + } + } + __syncthreads(); + + // At this point each thread in the block has a local gradient, which we need to + // accumulate prior to writing the global value + AccT *buf = smem.getPointer(); + AccT tval = reduceBlock>( + buf, blockDim.x, grad, ReduceAdd(), ScalarConvert::to(0)); + + // After reduction, first thread in the block has the gradient, so its responsible + // for writing it to gradWeight + if (threadIdx.x == 0) { + int weightOffset = kW + (kernelWidth * kH) + (kernelWidth * kernelHeight * ch); + gradWeight.data()[weightOffset] = ScalarConvert::to(tval); + } +} + +#include "generic/SpatialDepthwiseConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu new file mode 100644 index 0000000..b8e9602 --- /dev/null +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -0,0 +1,11 @@ +#include "THCUNN.h" +#include "common.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/SpatialDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu new file mode 100644 index 0000000..6732e4f --- /dev/null +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -0,0 +1,116 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCNumerics.cuh" +#include "common.h" + +// kernels borrowed from Caffe +template +__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, Dtype* top_data, + int64_t* top_mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / 
channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + while(hstart < 0) + hstart += dilation_h; + while(wstart < 0) + wstart += dilation_w; + AccType maxval = THCNumerics::min(); + int maxidx = -1; + bottom_data += (n * channels + c) * height * width; + for (int h = hstart; h < hend; h += dilation_h) { + for (int w = wstart; w < wend; w += dilation_w) { + Dtype val = bottom_data[h * width + w]; + if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { + maxidx = h * width + w; + maxval = ScalarConvert::to(val); + } + } + } + top_data[index] = ScalarConvert::to(maxval); + top_mask[index] = maxidx + TH_INDEX_BASE; + } +} + +const int BACKWARD_THREADS = 256; + +template +__launch_bounds__(BACKWARD_THREADS,2048/BACKWARD_THREADS) +__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, + const int64_t* top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, height*width) { + int h = index/width; + int w = index - h * width; +//get some templating performance benefits without actually templating + int phstart, phend, pwstart, pwend; + if (stride_h == 1) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; + phend = min((h + pad_h) + 1, pooled_height); + } else if (stride_h == 2) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; + phend = min((h + pad_h) / 2 + 1, pooled_height); + } else { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; + phend = min((h + pad_h) / stride_h + 1, pooled_height); + } + if (stride_w == 1) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; + pwend = min((w + pad_w) + 1, pooled_width); + } else if (stride_w == 2) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; + pwend = min((w + pad_w) / 2 + 1, pooled_width); + } else { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 
0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; + pwend = min((w + pad_w) / stride_w + 1, pooled_width); + } + for (int n = blockIdx.y; n < num; n += gridDim.y) + for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + + AccType gradient = AccType(0); + int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + top_mask += offset; +//get some templating performance benefits without actually templating + if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) { + gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); + } + } + } + } else { + if (top_mask[phstart * pooled_width + pwstart] - TH_INDEX_BASE == h * width + w) { + gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); + } + } + bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); + } + } +} + +#include "generic/SpatialDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu new file mode 100644 index 0000000..f3ca162 --- /dev/null +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -0,0 +1,113 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__device__ inline int getInterval(Acctype sample, + int index, + int inputSize, + int outputSize, + int poolSize) { + Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); + if (index == outputSize - 1) { + return inputSize - poolSize; + } else { + return (int) ((index + sample) * alpha) - (int) (sample * alpha); + } +} + +// We template on poolSizeW to allow the innermost loop to be unrolled +template +__global__ void SpatialFractionalMaxPooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + THCDeviceTensor indices, + THCDeviceTensor samples, + int poolSizeW, int poolSizeH) { + + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < output.getSize(2) * output.getSize(3)) { + int outputW = ourOutputPoint % output.getSize(3); + int outputH = ourOutputPoint / output.getSize(3); + + int poolW = getInterval(ScalarConvert::to(samples[batch][plane][0]), outputW, + input.getSize(3), output.getSize(3), poolSizeW); + int poolH = getInterval(ScalarConvert::to(samples[batch][plane][1]), outputH, + input.getSize(2), output.getSize(2), poolSizeH); + + Dtype maxVal = THCNumerics::min(); + int maxIndex = -1; + + for (int h = poolH; h < poolH + poolSizeH; ++h) { + if (PoolSizeWStatic == -1) { + for (int w = poolW; w < poolW + poolSizeW; ++w) { + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } + } + } else { +#pragma unroll + for (int i = 0; i < PoolSizeWStatic; ++i) { + int w = i + poolW; + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } + } + } + } + + 
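+    // getInterval always yields a pooling window that is non-empty and lies
+    // fully inside the input plane, so the scan above should have recorded a
+    // maximum; the asserts below only check that this invariant held.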
assert(THCNumerics::ne(maxVal, THCNumerics::min())); + assert(maxIndex != -1); + + // +1 for Lua index + indices[batch][plane][outputH][outputW] = maxIndex + TH_INDEX_BASE; + output[batch][plane][outputH][outputW] = maxVal; + } +} + +template +__global__ void SpatialFractionalMaxPooling_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + THCDeviceTensor indices) { + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3)) { + int outputW = ourOutputPoint % gradOutput.getSize(3); + int outputH = ourOutputPoint / gradOutput.getSize(3); + + int index = indices[batch][plane][outputH][outputW] - TH_INDEX_BASE; + assert(index >= 0); + int inputW = index % gradInput.getSize(3); + int inputH = index / gradInput.getSize(3); + assert(inputH < gradInput.getSize(2)); + + atomicAdd(gradInput[batch][plane][inputH][inputW].data(), + gradOutput[batch][plane][outputH][outputW]); + } +} + +#include "generic/SpatialFractionalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu new file mode 100644 index 0000000..4e37ecf --- /dev/null +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -0,0 +1,8 @@ +#include "THCUNN.h" +#include "im2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu new file mode 100644 index 0000000..61e1fe5 --- /dev/null +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -0,0 +1,9 @@ +#include "THCUNN.h" +#include "im2col.h" +#include "THCTensor.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/SpatialFullDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu new file mode 100644 index 0000000..30a1a5d --- /dev/null +++ b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu @@ -0,0 +1,243 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, H, W)) { \ + atomicAdd(&input[n][c][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + int c; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { + out_val += input[n][c][iy_nw][ix_nw] * nw; + } + if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { + out_val += input[n][c][iy_ne][ix_ne] * ne; + } + if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { + out_val += input[n][c][iy_sw][ix_sw] * sw; + } + if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { + out_val += input[n][c][iy_se][ix_se] * se; + } + output[n][c][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy));; + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; 
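+    // (ix_sw, iy_sw) and (ix_se, iy_se) below complete the 2x2 neighbourhood
+    // around the unnormalized sampling location (ix, iy); the four corners
+    // are weighted by the bilinear surface areas nw/ne/sw/se computed next.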
+ int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + Dtype gradout; + Dtype nw_val; + Dtype ne_val; + Dtype sw_val; + Dtype se_val; + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + nw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { + nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; + } + ne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { + ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; + } + sw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { + sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; + } + se_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { + se_val = input[n][c][iy_se_cl][ix_se_cl]; + } + + gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); + gix += ne_val * (iy_sw - iy) * gradout; + gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); + gix += se_val * (iy - iy_nw) * gradout; + + giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); + giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + Dtype gix_old = gradGrid[n][h][w][0]; + Dtype giy_old = gradGrid[n][h][w][1]; + + gradGrid[n][h][w][0] = gix_old + gix; + gradGrid[n][h][w][1] = giy_old + giy; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/SpatialGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialMaxPooling.cu b/aten/src/THCUNN/SpatialMaxPooling.cu new file mode 100644 index 0000000..90e6fe4 --- /dev/null +++ b/aten/src/THCUNN/SpatialMaxPooling.cu @@ -0,0 +1,4 @@ +#include "THCUNN.h" + +#include "generic/SpatialMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialMaxUnpooling.cu b/aten/src/THCUNN/SpatialMaxUnpooling.cu new file mode 100644 index 0000000..56488fd --- /dev/null +++ b/aten/src/THCUNN/SpatialMaxUnpooling.cu @@ -0,0 +1,32 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" + +template +__global__ void 
MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const int64_t* bottom_mask, + const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels + int c = (index / iwidth / iheight) % channels; + int n = index / iwidth / iheight / channels; + top_data += (n*channels + c)*oheight*owidth; + int maxind = bottom_mask[index] - TH_INDEX_BASE; + + top_data[maxind] = bottom_data[index]; + } +} + +template +__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const int64_t* bottom_mask, + const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + int c = (index / iwidth / iheight) % channels; + int n = index / iwidth / iheight / channels; + top_diff += (n*channels + c)*oheight*owidth; + int maxind = bottom_mask[index] - TH_INDEX_BASE; + + bottom_diff[index] = top_diff[maxind]; + } +} + +#include "generic/SpatialMaxUnpooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu new file mode 100644 index 0000000..96472ee --- /dev/null +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -0,0 +1,87 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void SpatialReflectionPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2) * output.getSize(3)) { + return; + } + int outputPointX = outputPointId % output.getSize(3); + int outputPointY = outputPointId / output.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (input.getSize(3) + padL - 1)) + - outputPointX + + 2 * padL + input.getSize(3) - 1 + - oStartX + iStartX; + + int inputPointY = abs(outputPointY - padT) + - abs(outputPointY - (input.getSize(2) + padT - 1)) + - outputPointY + + 2 * padT + input.getSize(2) - 1 + - oStartY + iStartY; + + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; + output[batch][plane][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void SpatialReflectionPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(3); + int outputPointY = outputPointId / gradOutput.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (gradInput.getSize(3) + padL - 1)) + - outputPointX + + 2 * padL + gradInput.getSize(3) - 1 + - 
oStartX + iStartX; + + int inputPointY = abs(outputPointY - padT) + - abs(outputPointY - (gradInput.getSize(2) + padT - 1)) + - outputPointY + + 2 * padT + gradInput.getSize(2) - 1 + - oStartY + iStartY; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); +} + +#include "generic/SpatialReflectionPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu new file mode 100644 index 0000000..f63c209 --- /dev/null +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -0,0 +1,70 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void SpatialReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2) * output.getSize(3)) { + return; + } + int outputPointX = outputPointId % output.getSize(3); + int outputPointY = outputPointId / output.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX; + int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY; + + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; + output[batch][plane][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void SpatialReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padT, int padB, int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(3); + int outputPointY = outputPointId / gradOutput.getSize(3); + + int iStartX = max(0, -padL); + int iStartY = max(0, -padT); + int oStartX = max(0, padL); + int oStartY = max(0, padT); + + int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX; + int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); +} + + +#include "generic/SpatialReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu new file mode 100644 index 0000000..bb04846 --- /dev/null +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -0,0 +1,265 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +/* + * Description: + * this function subsamples an input 3D tensor along dimensions 1 and 2 + * 3D input, 3D output, 1D weight, 1D bias + 
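+ *   each output element is the sum over a kH x kW input window, scaled by
+ *   the per-plane weight[k] and offset by the per-plane bias[k]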
*/ + template +__global__ void subsample(Dtype *input, Dtype *output, Dtype *weight, Dtype *bias, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + output = output + o*output_w*output_h; + input = input + i*input_w*input_h; + + // Get the good mask for (k,i) (k out, i in) + Dtype the_weight = weight[k]; + + // Initialize to the bias + Dtype the_bias = bias[k]; + + // For all output pixels... + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + // Compute the mean of the input image... + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_output = output + yy*output_w + xx; + Acctype sum = 0; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += input_w; // next input line + } + // Update output + *ptr_output = ScalarConvert::to(the_weight*sum + the_bias); + } + } +} + +/* + * Description: + * this function computes the gradWeight from input and gradOutput + */ + template +__global__ void subgradweight(Dtype *input, Dtype *gradOutput, Dtype *gradWeight, Dtype *gradBias, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW, + float scale) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + input = input + i*input_w*input_h; + + // thread ID + int tid = blockDim.x*threadIdx.y + threadIdx.x; + + // create array to hold partial sums + __shared__ Acctype sums[CUDA_MAX_THREADS]; + sums[tid] = 0; + + // compute partial sums + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput; + int64_t kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + sums[tid] += z * ptr_input[kx]; + } + ptr_input += input_w; + } + } + } + __syncthreads(); + + // reduce: accumulate all partial sums to produce final gradWeight + if ((threadIdx.x == 0) && (threadIdx.y == 0)) { + Acctype scaledSums = Acctype(0); + for(int i = 0; i < blockDim.x*blockDim.y; i++) { + scaledSums += scale*sums[i]; + } + gradWeight[k] += ScalarConvert::to(scaledSums); + } + __syncthreads(); + + // compute gradBias + sums[tid] = 0; + for (int i=tid; i::to(scaledSums); + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void subgradinput(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // 
output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + gradInput = gradInput + i*input_w*input_h; + + // get weight + Dtype the_weight = weight[k]; + + // compute gradInput + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput * the_weight; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? + ptr_gradInput[kx] += z; + } + ptr_gradInput += input_w; + } + } + } +} + +/* + * Description: + * this function computes the gradInput from weight and gradOutput + */ + template +__global__ void subgradinputAtomic(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, + int input_n, int input_h, int input_w, + int kH, int kW, int dH, int dW) +{ + // iterators + int xx, yy; + + // output size + int output_w = (input_w - kW) / dW + 1; + int output_h = (input_h - kH) / dH + 1; + + // compute offsets based on thread/block ID + int o = blockIdx.x; + int i = o; + int k = blockIdx.x % input_n; + + int xx_start = threadIdx.x; + int xx_end = output_w; + int xx_step = blockDim.x; + + int yy_start = blockDim.y*blockIdx.y + threadIdx.y; + int yy_end = output_h; + int yy_step = blockDim.y*gridDim.y; + + // select input/output plane + gradOutput = gradOutput + o*output_w*output_h; + gradInput = gradInput + i*input_w*input_h; + + // get weight + Dtype the_weight = weight[k]; + + // compute gradInput + for(yy = yy_start; yy < yy_end; yy+=yy_step) { + for(xx = xx_start; xx < xx_end; xx+=xx_step) { + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput * the_weight; + int kx, ky; + for(ky = 0; ky < kH; ky++) { + for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? 
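+          // atomicAdd is needed in this variant: with overlapping windows
+          // (e.g. dW < kW or dH < kH) several output positions map to the
+          // same gradInput element, and those updates can come from
+          // different threads/blocks, so a plain += would race.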
+ atomicAdd(&(ptr_gradInput[kx]), z); + } + ptr_gradInput += input_w; + } + } + } +} + + +#include "generic/SpatialSubSampling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu new file mode 100644 index 0000000..07daa0e --- /dev/null +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -0,0 +1,124 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rheight, const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + return; + } + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] + + w1lambda * data1[n][c][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] + + w1lambda * data1[n][c][h1+h1p][w1+w1p]); + data2[n][c][h2][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rheight, const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][h1][w1]; + data1[n][c][h2][w2] += val; + } + } + return; + } + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][h2][w2]; + atomicAdd(data1[n][c][h1][w1].data(), + ScalarConvert::to(h0lambda * w0lambda * d2val)); + atomicAdd(data1[n][c][h1][w1+w1p].data(), + ScalarConvert::to(h0lambda * w1lambda * d2val)); + atomicAdd(data1[n][c][h1+h1p][w1].data(), + ScalarConvert::to(h1lambda * w0lambda * d2val)); + atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(), + ScalarConvert::to(h1lambda * w1lambda * d2val)); + } + } + } +} + + +#include "generic/SpatialUpSamplingBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu new file mode 100644 index 0000000..889d64e --- /dev/null +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -0,0 +1,102 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_4d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_4d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int height1 = data1.getSize(2); + const int width1 = data1.getSize(3); + const int height2 = data2.getSize(2); + const int width2 = data2.getSize(3); + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][h2][w2]; + data1[n][c][h1][w1] = val; + } + } + return; + } + // + const int h1 = 
nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][h2][w2]; + atomicAdd(data1[n][c][h1][w1].data(), d2val); + } + } + } +} + + +#include "generic/SpatialUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu new file mode 100644 index 0000000..a52ce34 --- /dev/null +++ b/aten/src/THCUNN/Sqrt.cu @@ -0,0 +1,33 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct sqrtupdateOutput_functor +{ + const T bias; + + sqrtupdateOutput_functor(T bias_) + : bias(bias_) + {} + + __device__ void operator()(T *output, const T *input) const + { + *output = sqrt(*input + bias); + } +}; + +template +struct sqrtupdateGradInput_functor +{ + sqrtupdateGradInput_functor() {} + + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const + { + *gradInput = (THCNumerics::eq(*output,ScalarConvert::to(0.0f))) ? ScalarConvert::to(0.0f) : ((ScalarConvert::to(0.5f) * *gradOutput) / *output); + } +}; + +#include "generic/Sqrt.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu new file mode 100644 index 0000000..66bbec4 --- /dev/null +++ b/aten/src/THCUNN/Square.cu @@ -0,0 +1,25 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct squareupdateOutput_functor +{ + __device__ void operator()(T* output, const T* input) const + { + *output = (*input) * (*input); + } +}; + +template +struct squareupdateGradInput_functor +{ + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const + { + *gradInput = ScalarConvert::to(2.0) * (*gradOutput) * (*input); + } +}; + +#include "generic/Square.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh new file mode 100644 index 0000000..c17f09d --- /dev/null +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -0,0 +1,248 @@ +#ifndef THC_HALF_AUTO_NUMERICS_INC +#define THC_HALF_AUTO_NUMERICS_INC + +#include "THCHalf.h" +#include "THCNumerics.cuh" + +// Half numerics functions defined as free functions, so cunn code can be +//written generically, i.e. without excessive calling of THCNumerics functions. + +// these functions should move to THCNumerics + +#ifdef CUDA_HALF_TENSOR +inline __host__ __device__ half fmaxType(half x, half y) { + return THCNumerics::ge(x, y) ? 
x : y; +} + +inline __host__ __device__ float fmaxType(float x, half y) { + return fmaxf(x, ScalarConvert::to(y)); +} +#endif + +inline __host__ __device__ float fmaxType(float x, float y) { + return fmaxf(x, y); +} + +inline __host__ __device__ double fmaxType(double x, double y) { + return fmax(x, y); +} + +#ifdef CUDA_HALF_TENSOR + +// arithmetic functions + +inline __host__ __device__ half operator+(half a, half b) { + return THCNumerics::add(a, b); +} + +inline __host__ __device__ float operator+(half a, float b) { + return ScalarConvert::to(a) + b; +} + +inline __host__ __device__ float operator+(float a, half b) { + return a + ScalarConvert::to(b); +} + +inline __host__ __device__ double operator+(double a, half b) { + return a + ScalarConvert::to(b); +} + +inline __host__ __device__ half operator-(half a) { + return THCNumerics::neg(a); +} + +inline __host__ __device__ half operator-(half a, half b) { + return THCNumerics::add(a, THCNumerics::neg(b)); +} + +inline __host__ __device__ half operator-(half a, int b) { + return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +} + +inline __host__ __device__ float operator-(half a, float b) { + return ScalarConvert::to(a) - b; +} + +inline __host__ __device__ double operator-(half a, double b) { + return ScalarConvert::to(a) - b; +} + +inline __host__ __device__ half operator-(int a, half b) { + return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); +} + +inline __host__ __device__ float operator-(float a, half b) { + return a - ScalarConvert::to(b); +} + +inline __host__ __device__ double operator-(double a, half b) { + return a - ScalarConvert::to(b); +} + +inline __host__ __device__ half operator*(half a, half b) { + return THCNumerics::mul(a, b); +} + +inline __host__ __device__ float operator*(half a, float b) { + return ScalarConvert::to(a) * b; +} + +inline __host__ __device__ double operator*(half a, double b) { + return ScalarConvert::to(a) * b; +} + +inline __host__ __device__ half operator*(half a, int b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ float operator*(float a, half b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ double operator*(double a, half b) { + return a * ScalarConvert::to(b); +} + +inline __host__ __device__ half operator/(half a, half b) { + return THCNumerics::div(a, b); +} + +inline __host__ __device__ float operator/(float a, half b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ double operator/(double a, half b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ half operator/(int a, half b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ float operator/(half a, float b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ double operator/(half a, double b) { + return ScalarConvert::to(a) / b; +} + +inline __host__ __device__ half operator/(half a, int b) { + return a / ScalarConvert::to(b); +} + +inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} +inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} + +inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { + lhs = lhs - rhs; + return lhs; +} + +inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { + lhs = lhs * rhs; + return lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { + lhs = lhs / rhs; + return 
lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { + lhs = lhs / rhs; + return lhs; +} + +inline __host__ __device__ half abs(half a) { + return THCNumerics::abs(a); +} + +inline __host__ __device__ half exp(half a) { + return THCNumerics::exp(a); +} + +inline __host__ __device__ half log10(half a) { + return THCNumerics::log10(a); +} + +inline __host__ __device__ half log1p(half a) { + return THCNumerics::log1p(a); +} + +inline __host__ __device__ half log2(half a) { + return THCNumerics::log2(a); +} + +inline __host__ __device__ half expm1(half a) { + return THCNumerics::expm1(a); +} + +inline __host__ __device__ half pow(half a, half b) { + return THCNumerics::pow(a, b); +} + +inline __host__ __device__ half sqrt(half a) { + return THCNumerics::sqrt(a); +} + +inline __host__ __device__ half tanh(half a) { + return THCNumerics::tanh(a); +} + +#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) +inline __host__ __device__ half operator+(half a, int b) { + return THCNumerics::add(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ double operator+(half a, double b) { + return ScalarConvert::to(a) + b; +} + +inline __host__ __device__ half operator*(half a, bool b) { + return THCNumerics::mul(a, ScalarConvert::to(b)); +} +#endif + +// comparison functions + +inline __host__ __device__ bool operator<(half a, half b) { + return THCNumerics::lt(a, b); +} + +inline __host__ __device__ bool operator<=(half a, half b) { + return THCNumerics::le(a, b); +} + +inline __host__ __device__ bool operator<=(half a, int b) { + return THCNumerics::le(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator<(half a, int b) { + return THCNumerics::lt(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator>(half a, half b) { + return THCNumerics::gt(a, b); +} + +inline __host__ __device__ bool operator>(half a, int b) { + return THCNumerics::gt(a, ScalarConvert::to(b)); +} + +inline __host__ __device__ bool operator>=(half a, half b) { + return THCNumerics::ge(a, b); +} + +inline __host__ __device__ bool operator>=(half a, int b) { + return THCNumerics::ge(a, ScalarConvert::to(b)); +} + +#endif +#endif diff --git a/aten/src/THCUNN/THCUNN.h b/aten/src/THCUNN/THCUNN.h new file mode 100644 index 0000000..09070b1 --- /dev/null +++ b/aten/src/THCUNN/THCUNN.h @@ -0,0 +1,10 @@ +#include + +#define THCIndexTensor THCudaLongTensor +#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME +typedef int64_t THCIndex_t; + +#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME) + +#include "generic/THCUNN.h" +#include diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu new file mode 100644 index 0000000..6781f33 --- /dev/null +++ b/aten/src/THCUNN/Tanh.cu @@ -0,0 +1,35 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct tanh_updateGradInput_functor +{ + __device__ __forceinline__ void operator()(T *gradInput, + const T *output, const T *gradOutput) const { + *gradInput = *gradOutput * (1.f - *output * *output); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct tanh_updateGradInput_functor +{ + __device__ __forceinline__ void operator()(half *gradInput, + const half *output, const half *gradOutput) const { +#ifdef CUDA_HALF_INSTRUCTIONS + const half one = __float2half(1.f); + const half out_square = __hmul(*output, *output); + *gradInput = __hmul(*gradOutput, __hadd(one, __hneg(out_square))); +#else + const float out = __half2float(*output); + const 
float go = __half2float(*gradOutput); + *gradInput = __float2half(go * (1.f - out * out)); +#endif + } +}; +#endif + +#include "generic/Tanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu new file mode 100644 index 0000000..af12169 --- /dev/null +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -0,0 +1,8 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" + +#include "generic/TemporalConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu new file mode 100644 index 0000000..2508f83 --- /dev/null +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -0,0 +1,86 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#define TEMPORAL_MAX_POOLING_THREADS 1024 + +template +__global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index + Dtype *input_data = input + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *output_data = output + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + int time = 0; + int max_time = input_n * kW; + + Dtype max_value; + THCIndex_t max_index = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + max_value = THCNumerics::min(); + // For all values in the kernel space + for (time = 0; time < max_time; time += input_n) { + if (max_value < input_data[time + feat]) { + max_value = input_data[time + feat]; + max_index = time / input_n; + } + } + output_data[feat] = max_value; + indices_data[feat] = max_index; + } + } +} + +template +__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index + Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat]; + } + } +} + +template +__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { + // Block idx is the batch index, thread idx + block idx y * 
MAX_THREADS is the time index + Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( + threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; + + int feat = 0; + + if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { + // For all features + for (feat = 0; feat < input_n; ++feat) { + atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]); + } + } +} + +#include "generic/TemporalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu new file mode 100644 index 0000000..4dd4da8 --- /dev/null +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -0,0 +1,70 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void TemporalReflectionPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2)) { + return; + } + int outputPointX = outputPointId % output.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (input.getSize(2) + padL - 1)) + - outputPointX + + 2 * padL + input.getSize(2) - 1 + - oStartX + iStartX; + + Dtype valueToCopy = input[batch][plane][inputPointX]; + output[batch][plane][outputPointX] = valueToCopy; +} + +template +__global__ void TemporalReflectionPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = abs(outputPointX - padL) + - abs(outputPointX - (gradInput.getSize(2) + padL - 1)) + - outputPointX + + 2 * padL + gradInput.getSize(2) - 1 + - oStartX + iStartX; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointX], valueToCopy); +} + +#include "generic/TemporalReflectionPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu new file mode 100644 index 0000000..2c812bd --- /dev/null +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -0,0 +1,62 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCReduceApplyUtils.cuh" +#include + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void 
TemporalReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= output.getSize(2)) { + return; + } + int outputPointX = outputPointId % output.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = min(max(padL, outputPointX), input.getSize(2) + padL - 1) - oStartX + iStartX; + + Dtype valueToCopy = input[batch][plane][inputPointX]; + output[batch][plane][outputPointX] = valueToCopy; +} + +template +__global__ void TemporalReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int padL, int padR) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= gradOutput.getSize(2)) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(2); + + int iStartX = max(0, -padL); + int oStartX = max(0, padL); + + int inputPointX = min(max(padL, outputPointX), gradInput.getSize(2) + padL - 1) - oStartX + iStartX; + + Dtype valueToCopy = gradOutput[batch][plane][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointX], valueToCopy); +} + + +#include "generic/TemporalReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu new file mode 100644 index 0000000..745fef8 --- /dev/null +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -0,0 +1,12 @@ +#include "THCUNN.h" +#include "common.h" +#include "row2col.h" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCTensor.hpp" +#include "THCStorage.hpp" + +#include "generic/TemporalRowConvolution.cu" + +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu new file mode 100644 index 0000000..89b0c37 --- /dev/null +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -0,0 +1,98 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + return; + } + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = w0lambda * data1[n][c][w1] + + w1lambda * data1[n][c][w1+w1p]; + data2[n][c][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][w1]; + data1[n][c][w2] += val; + } + } + return; + } + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][w2]; + atomicAdd(data1[n][c][w1].data(), + ScalarConvert::to(w0lambda * d2val)); + atomicAdd(data1[n][c][w1+w1p].data(), + ScalarConvert::to(w1lambda * d2val)); + } + } + } +} + + +#include "generic/TemporalUpSamplingLinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu new file mode 100644 index 0000000..c87129d --- /dev/null +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -0,0 +1,89 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_3d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + const float scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][w1]; + data2[n][c][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_3d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int width1 = data1.getSize(2); + const int width2 = data2.getSize(2); + const float scale = (float) width1 
/ (float) width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][w1]; + data1[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype d2val = data2[n][c][w2]; + atomicAdd(data1[n][c][w1].data(), d2val); + } + } + } +} + + +#include "generic/TemporalUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu new file mode 100644 index 0000000..e7757eb --- /dev/null +++ b/aten/src/THCUNN/Threshold.cu @@ -0,0 +1,75 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include + +template +struct ThresholdUpdateOutput +{ + const T threshold_; + const T val_; + + ThresholdUpdateOutput(T threshold, T val) + : threshold_(threshold) + , val_(val) + {} + + __device__ __forceinline__ void operator()(T *out, T *in) + { + T x = *in; + *out = (x > threshold_) ? x : val_; + } +}; + +// in-place variant +template +struct ThresholdUpdateOutputIP +{ + const T threshold_; + const T val_; + + ThresholdUpdateOutputIP(T threshold, T val) + : threshold_(threshold) + , val_(val) + {} + + __device__ __forceinline__ void operator()(T *x) + { + *x = (*x > threshold_) ? *x : val_; + } +}; + +template +struct ThresholdUpdateGradInput +{ + const T threshold_; + + ThresholdUpdateGradInput(T threshold) + : threshold_(threshold) + {} + + __device__ __forceinline__ void operator()( + T *gradInput, T *input, T *gradOutput) const + { + *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert::to(0); + } +}; + +template +struct ThresholdUpdateGradInputIP +{ + const T threshold_; + + ThresholdUpdateGradInputIP(T threshold) + : threshold_(threshold) + {} + + __device__ __forceinline__ void operator()( + T *gradOutput, T *input) const + { + *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert::to(0); + } +}; + +#include "generic/Threshold.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu new file mode 100644 index 0000000..84e2c7f --- /dev/null +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -0,0 +1,248 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +// 5d tensor B x D x T x H x W +// All kernels view batch dim B and feature dim D as collapsed. + +/* + * Description: + * This function adaptively average pools an input 5D tensor along dimensions + * 2, 3 and 4. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). 
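+ *
+ * Pooling regions come from the START_IND/END_IND macros above: along an axis
+ * with input size c and output size b, output index a covers the input range
+ * [floor(a*c/b), ceil((a+1)*c/b)). As an illustration only, isizeH = 10 and
+ * osizeH = 4 give H bins [0,3), [2,5), [5,8), [7,10), so neighbouring bins may
+ * share an input element; the two updateGradInput kernels below differ in how
+ * they handle that overlap.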
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel(
+  T *input, T *output,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t istrideD,
+  int64_t istrideT, int64_t istrideH, int64_t istrideW,
+  int64_t offsetZ)
+{
+  // iterators on output pixels
+  int ot, oh, ow;
+
+  // compute offsets based on thread/block ID
+  int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int oendH = osizeH;
+  int ostepH = gridDim.y * blockDim.y;
+  int ostartW = threadIdx.x;
+  int oendW = osizeW;
+  int ostepW = blockDim.x;
+
+  // select output plane
+  int64_t o_plane = blockIdx.x + offsetZ;
+  ot = o_plane % osizeT;     // output frame/time
+  int d = o_plane / osizeT;  // slice/feature
+
+  // input frame/time range is fixed.
+  int istartT = START_IND(ot, osizeT, isizeT);
+  int iendT = END_IND(ot, osizeT, isizeT);
+  int kT = iendT - istartT;
+
+  // input offset by slice/feature and earliest relevant frame/time
+  T *input_dt = input + d*istrideD + istartT*istrideT;
+  // output offset by slice/feature and frame/time
+  T *output_dt = output + o_plane*osizeH*osizeW;
+
+  // For all output pixels...
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+
+    int istartH = START_IND(oh, osizeH, isizeH);
+    int iendH = END_IND(oh, osizeH, isizeH);
+    int kH = iendH - istartH;
+
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+
+      int istartW = START_IND(ow, osizeW, isizeW);
+      int iendW = END_IND(ow, osizeW, isizeW);
+      int kW = iendW - istartW;
+
+      // Compute the average pooling from corresponding input pixels
+      T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW;
+      T *ptr_output = output_dt + oh*osizeW + ow;
+      T sum = ScalarConvert<int, T>::to(0);
+
+      int it, ih, iw;
+      for(it = 0; it < kT; ++it) {
+        for(ih = 0; ih < kH; ++ih) {
+          for(iw = 0; iw < kW; ++iw) {
+            T val = ptr_input[ih*istrideH + iw*istrideW];
+            sum += val;
+          }
+        }
+        ptr_input += istrideT;   // next input frame
+      }
+      // Update output
+      *ptr_output = sum / kT / kH / kW;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D input plane specified by
+ *    (blockIdx.x + offsetZ).
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel(
+  T *gradInput, T *gradOutput,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t offsetZ
+)
+{
+  // iterators on input pixels
+  int it, ih, iw;
+
+  // compute offsets based on thread/block ID
+  int istartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int iendH = isizeH;
+  int istepH = gridDim.y * blockDim.y;
+  int istartW = threadIdx.x;
+  int iendW = isizeW;
+  int istepW = blockDim.x;
+
+  // select input plane
+  int64_t i_plane = blockIdx.x + offsetZ;
+  it = i_plane % isizeT;     // input frame/time
+  int d = i_plane / isizeT;  // slice/feature
+
+  // output frame/time range is fixed.
+  int ostartT = START_IND(it, isizeT, osizeT);
+  int oendT = END_IND(it, isizeT, osizeT);
+
+  // gradInput offset by slice/feature and frame/time
+  T *gradInput_dt = gradInput + i_plane*isizeH*isizeW;
+  // gradOutput offset by slice/feature and earliest relevant frame/time
+  T *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW;
+
+  // For all input pixels...
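+  // Gather formulation: each thread owns one input pixel (ih, iw) of plane
+  // i_plane and sums, over every output bin that covers it, the corresponding
+  // gradOutput value divided by that bin's volume kT*kH*kW. Every gradInput
+  // element is therefore written by exactly one thread, so no atomics are
+  // needed (compare with the atomic variant further below).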
+ for(ih = istartH; ih < iendH; ih += istepH) { + + int ostartH = START_IND(ih, isizeH, osizeH); + int oendH = END_IND(ih, isizeH, osizeH); + + for(iw = istartW; iw < iendW; iw += istepW) { + + int ostartW = START_IND(iw, isizeW, osizeW); + int oendW = END_IND(iw, isizeW, osizeW); + + // Compute the gradients from corresponding output pixels + T *ptr_gradInput = gradInput_dt + ih*isizeW + iw; + T *ptr_gradOutput = gradOutput_dt; + + // for all relevant output pixels + int ot, oh, ow; + for(ot = ostartT; ot < oendT; ++ot) { + int kT = END_IND(ot, osizeT, isizeT) - START_IND(ot, osizeT, isizeT); + for(oh = ostartH; oh < oendH; ++oh) { + int kH = END_IND(oh, osizeH, isizeH) - START_IND(oh, osizeH, isizeH); + for(ow = ostartW; ow < oendW; ++ow) { + int kW = END_IND(ow, osizeW, isizeW) - START_IND(ow, osizeW, isizeW); + T grad_delta = ptr_gradOutput[oh*osizeW + ow] / kW / kH / kT; + *ptr_gradInput += grad_delta; + } + } + ptr_gradOutput += osizeH*osizeW; // next output frame + } + } + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput without assuming + * dependencies between input pixels and output pixels. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + * + * (uses atomic add) + */ + template +__global__ void cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( + T *gradInput, T *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ +) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // output slice/feature + + // input frame/time ramge is fixed. + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // gradInput offset by slice/feature and earliest relevant frame/time + T *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/time + T *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; + + // For all output pixels... 
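+  // Scatter formulation: each thread takes one output bin and adds
+  // gradOutput / (kT*kH*kW) into every input pixel of that bin. Because
+  // adjacent bins can overlap when the input size is not a multiple of the
+  // output size, two threads may hit the same gradInput element, hence the
+  // atomicAdd below.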
+ for(oh = ostartH; oh < oendH; oh += ostepH) { + + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = ostartW; ow < oendW; ow += ostepW) { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients from corresponding input pixels + T *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; + T *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput / kT / kH / kW; + + int it, ih, iw; + for(it = 0; it < kT; ++it) { + for(ih = 0; ih < kH; ++ih) { + for(iw = 0; iw < kW; ++iw) { + atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); + } + } + ptr_gradInput += isizeH*isizeW; // next input frame + } + } + } +} + +#include "generic/VolumetricAdaptiveAveragePooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS +#undef START_IND +#undef END_IND diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu new file mode 100644 index 0000000..6d542ba --- /dev/null +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -0,0 +1,207 @@ +#include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +#include "THCTensor.hpp" + +#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +/* + * Description: + * this function adaptively maxpools an input 4D tensor along dimensions 2 and 3 + * 4D input, 4D output, 4D argmax x and y + */ + template +__global__ void cunn_VolumetricAdaptiveMaxPooling_updateOutput_kernel( + T *input, T *output, THCIndex_t *indices, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, + int64_t istrideT, int64_t istrideH, int64_t istrideW, + int64_t offsetZ) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // slice/feature + + // input frame/time ramge is fixed. + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // input offset by slice/feature and earliest relevant frame/time + T *input_dt = input + d*istrideD + istartT*istrideT; + // output offset by slice/feature and frame/time + T *output_dt = output + o_plane*osizeH*osizeW; + // indices offset by slice/feature and frame/time + THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW; + + // For all output pixels... 
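+  // The stored index is a linear offset into the d-th input plane,
+  // (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + (iw+istartW), shifted by
+  // TH_INDEX_BASE (the legacy Lua indexing base); the backward kernels subtract
+  // the base and use it to address gradInput directly. The "|| isnan(val)"
+  // test lets NaN inputs win the comparison, so NaNs propagate to the output
+  // instead of being silently skipped.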
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+
+    int istartH = START_IND(oh, osizeH, isizeH);
+    int iendH = END_IND(oh, osizeH, isizeH);
+    int kH = iendH - istartH;
+
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+
+      int istartW = START_IND(ow, osizeW, isizeW);
+      int iendW = END_IND(ow, osizeW, isizeW);
+      int kW = iendW - istartW;
+
+      // Compute the max pooling from corresponding input pixels
+      T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW;
+      T *ptr_output = output_dt + oh*osizeW + ow;
+      THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow;
+      int64_t argmax = -1;
+      T max = THCNumerics<T>::min();
+
+      int it, ih, iw;
+      for(it = 0; it < kT; ++it) {
+        for(ih = 0; ih < kH; ++ih) {
+          for(iw = 0; iw < kW; ++iw) {
+            T val = ptr_input[ih*istrideH + iw*istrideW];
+            if ((val > max) || THCNumerics<T>::isnan(val)) {
+              max = val;
+              argmax = (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + iw+istartW;
+            }
+          }
+        }
+        ptr_input += istrideT;   // next input frame
+      }
+      // Update output and argmax
+      *ptr_output = max;
+      *ptr_ind = argmax + TH_INDEX_BASE;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D output plane specified by
+ *    (blockIdx.x + offsetZ).
+ *
+ *    Assumes that input size can be perfectly divided by output size, i.e.
+ *    each input pixel can only be argmax of one output pixel.
+ */
+ template <typename T>
+__global__ void cunn_VolumetricAdaptiveMaxPooling_updateGradInput_kernel(
+  T *gradInput, T *gradOutput, THCIndex_t *indices,
+  int isizeT, int isizeH, int isizeW,
+  int osizeT, int osizeH, int osizeW,
+  int64_t offsetZ
+)
+{
+  // iterators on output pixels
+  int oh, ow;
+
+  // compute offsets based on thread/block ID
+  int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
+  int oendH = osizeH;
+  int ostepH = gridDim.y * blockDim.y;
+  int ostartW = threadIdx.x;
+  int oendW = osizeW;
+  int ostepW = blockDim.x;
+
+  // select output plane
+  int64_t o_plane = blockIdx.x + offsetZ;
+  int d = o_plane / osizeT;  // output slice/feature
+
+  // gradInput offset by slice/feature
+  T *gradInput_d = gradInput + d*isizeT*isizeH*isizeW;
+  // gradOutput offset by slice/feature and frame/time
+  T *gradOutput_dt = gradOutput + o_plane*osizeH*osizeW;
+  // indices offset by slice/feature and frame/time
+  THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW;
+
+  // For all output pixels...
+  for(oh = ostartH; oh < oendH; oh += ostepH) {
+    for(ow = ostartW; ow < oendW; ow += ostepW) {
+      // Compute the gradients for the argmax input pixel
+      T *ptr_gradOutput = gradOutput_dt + oh*osizeW + ow;
+      THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow;
+      T grad_delta = *ptr_gradOutput;
+      int argmax = (*ptr_ind) - TH_INDEX_BASE;
+      gradInput_d[argmax] += grad_delta;
+    }
+  }
+}
+
+
+/*
+ * Description:
+ *    This function computes the gradInput from gradOutput.
+ *
+ *    gridDim.y blocks work together on a single 2D output plane specified by
+ *    (blockIdx.x + offsetZ).
+ *
+ *    Uses atomic add.
+ */ + template +__global__ void cunn_atomic_VolumetricAdaptiveMaxPooling_updateGradInput_kernel( + T *gradInput, T *gradOutput, THCIndex_t *indices, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ +) +{ + // iterators on output pixels + int oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + int d = o_plane / osizeT; // output slice/feature + + // gradInput offset by slice/feature + T *gradInput_d = gradInput + d*isizeT*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/otme + T *gradOutput_dt = gradOutput + o_plane*osizeH*osizeW; + // indices offset by slice/feature and frame/otme + THCIndex_t *indices_dt = indices + o_plane*osizeH*osizeW; + + // For all output pixels... + for(oh = ostartH; oh < oendH; oh += ostepH) { + for(ow = ostartW; ow < oendW; ow += ostepW) { + // Compute the gradients for the argmax input pixel + T *ptr_gradOutput = gradOutput_dt + oh*osizeW + ow; + THCIndex_t *ptr_ind = indices_dt + oh*osizeW + ow; + T grad_delta = *ptr_gradOutput; + int64_t argmax = (*ptr_ind) - TH_INDEX_BASE; + atomicAdd(&(gradInput_d[argmax]), grad_delta); + } + } +} + +#include "generic/VolumetricAdaptiveMaxPooling.cu" +#include "THCGenerateFloatTypes.h" + +#undef CUDA_MAX_THREADS diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu new file mode 100644 index 0000000..610127c --- /dev/null +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -0,0 +1,279 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void cuda_VolumetricAveragePooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oCol < output.getSize(3)) + { + Acctype sum = 0.0; + + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, input.getSize(1) + padT); + int hend = min(hstart + kH, input.getSize(2) + padH); + int wend = min(wstart + kW, input.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, input.getSize(1)); + hend = min(hend, input.getSize(2)); + wend = min(wend, input.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + int ti, hi, wi; + for (ti = tstart; ti < tend; ++ti) + { + for (hi = hstart; hi < hend; ++hi) + { + for (wi = wstart; wi < wend; ++wi) + { + Dtype val = input[slice][ti][hi][wi]; + sum += val; + } + } + } + + 
output[slice][oFrame][oRow][oCol] = ScalarConvert::to(sum / divide_factor); + } +} + +// Inner-most loop size (kW) passed as template parameter for +// performance reasons. +// +template +__global__ void cuda_VolumetricAveragePooling_updateOutput_fixedKW( + THCDeviceTensor input, + THCDeviceTensor output, + int kT, int kH, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oCol < output.getSize(3)) + { + Acctype sum = 0.0; + + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, input.getSize(1) + padT); + int hend = min(hstart + kH, input.getSize(2) + padH); + int wend = min(wstart + KERNEL_WIDTH, input.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, input.getSize(1)); + hend = min(hend, input.getSize(2)); + wend = min(wend, input.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + int ti, hi, wi; + for (ti = tstart; ti < tend; ++ti) + { + for (hi = hstart; hi < hend; ++hi) + { + for (wi = wstart; wi < wend; ++wi) + { + Dtype val = input[slice][ti][hi][wi]; + sum += val; + } + } + } + + output[slice][oFrame][oRow][oCol] = ScalarConvert::to(sum / divide_factor); + } +} + +#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ + cuda_VolumetricAveragePooling_updateOutput_fixedKW \ + <<>>( \ + cudaInput, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW, count_include_pad, offsetZ); \ + break + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, int kW, + Acctype normFactor, int offsetZ) +{ + int iCol = blockIdx.x * blockDim.x + threadIdx.x; + int iRow = blockIdx.y * blockDim.y + threadIdx.y; + int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // input frame/time + int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // input slice/feature + + // guard against over-tiled threads + if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3)) + { + Acctype sum = 0.0; + Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] + [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)]; + int frameOffset = 0; + for (int oFrame = max(0, iFrame - kT + 1); + oFrame < min(iFrame + 1, gradOutput.getSize(1)); + ++oFrame) + { + int rowOffset = frameOffset; + for (int oRow = max(0, iRow - kH + 1); + oRow < min(iRow + 1, gradOutput.getSize(2)); + ++oRow) + { + int colOffset = rowOffset; + for (int oCol = max(0, iCol - kW + 1); + oCol < min(iCol + 1, gradOutput.getSize(3)); + ++oCol) + { + sum += gOut[colOffset]; + ++colOffset; + } + rowOffset += gradOutput.getSize(3); + } + frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3); + } + gradInput[slice][iFrame][iRow][iCol] = ScalarConvert::to(sum * normFactor); + } +} + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, 
int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature + + // guard against over-tiled threads + if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) + { + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, gradInput.getSize(1) + padT); + int hend = min(hstart + kH, gradInput.getSize(2) + padH); + int wend = min(wstart + kW, gradInput.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, gradInput.getSize(1)); + hend = min(hend, gradInput.getSize(2)); + wend = min(wend, gradInput.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + Dtype val = ScalarConvert::to( + ScalarConvert::to(gradOutput[slice][oFrame][oRow][oCol]) / divide_factor); + for (int iFrame = tstart; iFrame < tend; ++iFrame) + { + for (int iRow = hstart; iRow < hend; ++iRow) + { + for (int iCol = wstart; iCol < wend; ++iCol) + { + atomicAdd(&gradInput[slice][iFrame][iRow][iCol], val); + } + } + } + } +} + +template +__global__ void cuda_VolumetricAveragePooling_updateGradInput( + THCDeviceTensor gradOutput, + THCDeviceTensor gradInput, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + bool count_include_pad, int offsetZ) +{ + int oCol = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // gradOutput frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // gradOutput slice/feature + + // guard against over-tiled threads + if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) + { + int tstart = oFrame * dT - padT; + int hstart = oRow * dH - padH; + int wstart = oCol * dW - padW; + int tend = min(tstart + kT, gradInput.getSize(1) + padT); + int hend = min(hstart + kH, gradInput.getSize(2) + padH); + int wend = min(wstart + kW, gradInput.getSize(3) + padW); + int pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = max(tstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + tend = min(tend, gradInput.getSize(1)); + hend = min(hend, gradInput.getSize(2)); + wend = min(wend, gradInput.getSize(3)); + + Acctype divide_factor; + if (count_include_pad) + divide_factor = static_cast(pool_size); + else + divide_factor = static_cast((tend - tstart) * (hend - hstart) * (wend - wstart)); + + Dtype val = ScalarConvert::to( + ScalarConvert::to(gradOutput[slice][oFrame][oRow][oCol]) / divide_factor); + for (int iFrame = tstart; iFrame < tend; ++iFrame) + { + for (int iRow = hstart; iRow < hend; ++iRow) + { + for (int iCol = wstart; iCol < wend; ++iCol) + { + gradInput[slice][iFrame][iRow][iCol] = val; + } + } + } + } +} + +#include "generic/VolumetricAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu new file mode 100644 index 0000000..da66140 
--- /dev/null +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -0,0 +1,159 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +// Kernel for fast unfold+copy +// Borrowed from Theano +// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas +template +__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + const int64_t height_col, const int64_t width_col, const int64_t depth_col, + Dtype* data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int64_t d_out = index % depth_col; + int64_t w_index = index / depth_col; + int64_t w_out = w_index % width_col; + int64_t h_index = w_index / width_col; + int64_t h_out = h_index % height_col; + + int64_t channel_in = h_index / height_col; + //channel_in = 1; + + int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d; + + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; + int64_t d_in = d_out * stride_d - pad_d; + + Dtype* data_col_ptr = data_col; + data_col_ptr += channel_out * (height_col * width_col * depth_col) + + h_out * (width_col * depth_col) + w_out * depth_col + d_out; + + const Dtype* data_im_ptr = data_im; + data_im_ptr += channel_in * (height * width * depth) + + h_in * (width * depth) + w_in * depth + d_in; + + for (int64_t i = 0; i < kernel_h; ++i) + { + int64_t h = h_in + i; + for (int64_t j = 0; j < kernel_w; ++j) + { + int64_t w = w_in + j; + for (int64_t k = 0; k < kernel_d; ++k) + { + int64_t d = d_in + k; + *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 && + h < height && w < width && d < depth) ? + data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert::to(0); + data_col_ptr += height_col * width_col * depth_col; + } + } + } + } +} + +template +void im3d2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + Dtype* data_col) +{ + // We are going to launch channels * height_col * width_col * depth_col kernels, each + // kernel responsible for copying a single-channel grid. 
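+  // Column-buffer extents follow the usual convolution output formula,
+  // (size + 2*pad - kernel) / stride + 1 per spatial dimension; with
+  // illustrative numbers height = 5, kernel_h = 3, pad_h = 1, stride_h = 2
+  // this gives (5 + 2 - 3) / 2 + 1 = 3 output rows. data_col is laid out as
+  // channels * kernel_h * kernel_w * kernel_d rows of
+  // height_col * width_col * depth_col columns.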
+ int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1; + int64_t num_kernels = channels * height_col * width_col * depth_col; + im3d2col_kernel<<>>(num_kernels, data_im, + height, width, depth, + kernel_h, kernel_w, kernel_d, + pad_h, pad_w, pad_d, + stride_h, stride_w, stride_d, + height_col, width_col, depth_col, + data_col); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t channels, + const int64_t patch_h, const int64_t patch_w, const int64_t patch_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + const int64_t height_col, const int64_t width_col, const int64_t depth_col, + Dtype* data_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + Acctype val = 0; + int64_t d = index % depth + pad_d; + int64_t w_index = index / depth; + int64_t w = w_index % width + pad_w; + int64_t h_index = w_index / width; + int64_t h = h_index % height + pad_h; + int64_t c = h_index / height; + + // compute the start and end of the output + int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1; + int64_t d_col_end = min(d / stride_d + 1, depth_col); + int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int64_t w_col_end = min(w / stride_w + 1, width_col); + int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; + int64_t h_col_end = min(h / stride_h + 1, height_col); + + int64_t offset = + (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col; + + int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col; + int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col; + int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col); + for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col) + for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col]; + } + } + data_im[index] = ScalarConvert::to(val); + } +} + +template +void col2im3d(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, const int64_t depth, + const int64_t patch_h, const int64_t patch_w, const int64_t patch_d, + const int64_t pad_h, const int64_t pad_w, const int64_t pad_d, + const int64_t stride_h, const int64_t stride_w, const int64_t stride_d, + Dtype* data_im) +{ + int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1; + int64_t num_kernels = channels * height * width * depth; + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
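+  // Each thread of col2im3d_kernel owns a single data_im element and sums
+  // every data_col entry that was unfolded from it; the precomputed offset and
+  // coeff_*_col terms fold the kernel-offset and column indices into one
+  // linear index, so the inner loops are pure additions. The accumulation
+  // stays private to the thread, which is why no atomicAdd is needed.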
+ col2im3d_kernel<<>>(num_kernels, data_col, + height, width, depth, channels, + patch_h, patch_w, patch_d, + pad_h, pad_w, pad_d, + stride_h, stride_w, stride_d, + height_col, width_col, depth_col, + data_im); + THCudaCheck(cudaGetLastError()); +} + +#include "generic/VolumetricConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu new file mode 100644 index 0000000..8a32c70 --- /dev/null +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -0,0 +1,9 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu new file mode 100644 index 0000000..1a0f2f6 --- /dev/null +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -0,0 +1,161 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( + Dtype* inputData, int inputT, int inputH, int inputW, + THCDeviceTensor indices, + THCDeviceTensor output, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oColumn < output.getSize(3)) + { + int tStart = oFrame * dT - padT; + int hStart = oRow * dH - padH; + int wStart = oColumn * dW - padW; + int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT); + int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH); + int wEnd = fminf(wStart + (kW - 1) * dilationW + 1, inputW); + + while(tStart < 0) + tStart += dilationT; + while(hStart < 0) + hStart += dilationH; + while(wStart < 0) + wStart += dilationW; + + int index = 0; + int maxIndex = -1; + inputData += slice * inputT * inputH * inputW; + + Dtype max = THCNumerics::min(); + + for (int t = tStart; t < tEnd; t += dilationT) + { + for (int h = hStart; h < hEnd; h += dilationH) + { + for (int w = wStart; w < wEnd; w += dilationW) + { + index = t * inputH * inputW + h * inputW + w; + Dtype val = inputData[index]; + + if ((max < val) || THCNumerics::isnan(val)) + { + max = val; + maxIndex = index; + } + } + } + } + + output[slice][oFrame][oRow][oColumn] = max; + indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE; + } +} + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( + Dtype* inputData, int inputT, int inputH, int inputW, + THCDeviceTensor indices, + THCDeviceTensor output, + int kT, int kH, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % output.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / 
output.getSize(1); // output slice/feature + + if (oRow < output.getSize(2) && oColumn < output.getSize(3)) + { + int tStart = oFrame * dT - padT; + int hStart = oRow * dH - padH; + int wStart = oColumn * dW - padW; + int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT); + int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH); + int wEnd = fminf(wStart + (KERNEL_WIDTH - 1) * dilationW + 1, inputW); + + while(tStart < 0) + tStart += dilationT; + while(hStart < 0) + hStart += dilationH; + while(wStart < 0) + wStart += dilationW; + + int index = 0; + int maxIndex = -1; + + Dtype max = THCNumerics::min(); + + for (int t = tStart; t < tEnd; t += dilationT) + { + for (int h = hStart; h < hEnd; h += dilationH) + { + for (int w = wStart; w < wEnd; w += dilationW) + { + index = t * inputH * inputW + h * inputW + w; + Dtype val = inputData[slice * inputT * inputH * inputW + index]; + + if (max < val) + { + max = val; + maxIndex = index; + } + } + } + } + + output[slice][oFrame][oRow][oColumn] = max; + indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE; + } +} + +template +__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput( + THCDeviceTensor gradOutput, + THCDeviceTensor indices, + Dtype* gradInputData, + int inputT, int inputH, int inputW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int offsetZ) +{ + int oColumn = blockIdx.x * blockDim.x + threadIdx.x; + int oRow = blockIdx.y * blockDim.y + threadIdx.y; + int oFrame = (blockIdx.z + offsetZ) % gradOutput.getSize(1); // output frame/time + int slice = (blockIdx.z + offsetZ) / gradOutput.getSize(1); // output slice/feature + + if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3)) + { + int maxIndex = indices[slice][oFrame][oRow][oColumn] - TH_INDEX_BASE; + if (maxIndex != -1) { + atomicAdd(&gradInputData[slice * inputT * inputH * inputW + maxIndex], + gradOutput[slice][oFrame][oRow][oColumn]); + } + } +} + +#include "generic/VolumetricDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu new file mode 100644 index 0000000..e6260ce --- /dev/null +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -0,0 +1,120 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#include + +template +__device__ inline int getInterval(Acctype sample, + int index, + int inputSize, + int outputSize, + int poolSize) { + Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); + if (index == outputSize - 1) { + return inputSize - poolSize; + } else { + return (int) ((index + sample) * alpha) - (int) (sample * alpha); + } +} + +// We template on poolSizeW to allow the innermost loop to be unrolled +template +__global__ void VolumetricFractionalMaxPooling_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + THCDeviceTensor indices, + THCDeviceTensor samples, + int poolSizeT, int poolSizeW, int poolSizeH) { + + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < output.getSize(2) * output.getSize(3) * output.getSize(4)){ + int outputT 
= ourOutputPoint % output.getSize(4); + int outputW = (ourOutputPoint / output.getSize(4)) % output.getSize(3); + int outputH = ourOutputPoint / (output.getSize(3)*output.getSize(4)); + + int poolT = getInterval(ScalarConvert::to(samples[batch][plane][0]), outputT, + input.getSize(4), output.getSize(4), poolSizeT); + int poolW = getInterval(ScalarConvert::to(samples[batch][plane][1]), outputW, + input.getSize(3), output.getSize(3), poolSizeW); + int poolH = getInterval(ScalarConvert::to(samples[batch][plane][2]), outputH, + input.getSize(2), output.getSize(2), poolSizeH); + + Dtype maxVal = THCNumerics::min(); + int maxIndex = -1; + + for (int h = poolH; h < poolH + poolSizeH; ++h) { + for (int w = poolW; w < poolW + poolSizeW; ++w) { + if (PoolSizeTStatic == -1) { + for (int t = poolT; t < poolT + poolSizeT; ++t) { + Dtype val = input[batch][plane][h][w][t]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3)*input.getSize(4) + w * input.getSize(4) + t; + maxVal = val; + } + } + } else { +#pragma unroll + for (int i = 0; i < PoolSizeTStatic; ++i) { + int t = i + poolT; + Dtype val = input[batch][plane][h][w][t]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3)*input.getSize(4) + w * input.getSize(4) + t; + maxVal = val; + } + } + } + } + } + + assert(THCNumerics::ne(maxVal, THCNumerics::min())); + assert(maxIndex != -1); + + // +1 for Lua index + indices[batch][plane][outputH][outputW][outputT] = maxIndex + TH_INDEX_BASE; + output[batch][plane][outputH][outputW][outputT] = maxVal; + } +} + +template +__global__ void VolumetricFractionalMaxPooling_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + THCDeviceTensor indices) { + // Output (h, w) point that this thread is responsible for + int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + // Each thread generates a specific output point + if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3) * gradOutput.getSize(4)) { + int outputT = ourOutputPoint % gradOutput.getSize(4); + int outputW = (ourOutputPoint / gradOutput.getSize(4)) % gradOutput.getSize(3); + int outputH = ourOutputPoint / (gradOutput.getSize(3)*gradOutput.getSize(4)); + + int index = indices[batch][plane][outputH][outputW][outputT] - TH_INDEX_BASE; + assert(index >= 0); + int inputT = index % gradInput.getSize(4); + int inputW = (index / gradInput.getSize(4)) % gradInput.getSize(3); + int inputH = index / (gradInput.getSize(3) * gradInput.getSize(4)); + assert(inputH < gradInput.getSize(2)); + + atomicAdd(gradInput[batch][plane][inputH][inputW][inputT].data(), + gradOutput[batch][plane][outputH][outputW][outputT]); + } +} + +#include "generic/VolumetricFractionalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu new file mode 100644 index 0000000..556b5bc --- /dev/null +++ b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -0,0 +1,7 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu new file mode 100644 index 0000000..c5c7196 --- /dev/null +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -0,0 
+1,9 @@ +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + +#include "generic/VolumetricFullDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu new file mode 100644 index 0000000..43b8cef --- /dev/null +++ b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,421 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ + atomicAdd(&input[n][c][z][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + int c; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype 
bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { + out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; + } + if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { + out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; + } + if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { + out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; + } + if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { + out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; + } + if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { + out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; + } + if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { + out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; + } + if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { + out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; + } + if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { + out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; + } + output[n][c][d][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + Dtype giz = ScalarConvert::to(0); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, 
we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + Dtype gradout; + Dtype tnw_val; + Dtype tne_val; + Dtype tsw_val; + Dtype tse_val; + Dtype bnw_val; + Dtype bne_val; + Dtype bsw_val; + Dtype bse_val; + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][d][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, 
iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + tnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { + tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; + } + tne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, IH, IW)) { + tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; + } + tsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { + tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; + } + tse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { + tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; + } + bnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { + bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; + } + bne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { + bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; + } + bsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { + bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; + } + bse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { + bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; + } + + Dtype m1 = ScalarConvert::to(-1); + gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * 
(IW - 1) / 2;
+    giy = giy * (IH - 1) / 2;
+    giz = giz * (ID - 1) / 2;
+
+    Dtype gix_old = gradGrid[n][d][h][w][0];
+    Dtype giy_old = gradGrid[n][d][h][w][1];
+    Dtype giz_old = gradGrid[n][d][h][w][2];
+
+    gradGrid[n][d][h][w][0] = gix_old + gix;
+    gradGrid[n][d][h][w][1] = giy_old + giy;
+    gradGrid[n][d][h][w][2] = giz_old + giz;
+  }
+}
+
+#undef MIN
+#undef MAX
+#undef CLIP_COORDINATES
+#undef WITHIN_BOUNDS
+#undef SAFE_ADD
+
+#include "generic/VolumetricGridSamplerBilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricMaxPooling.cu b/aten/src/THCUNN/VolumetricMaxPooling.cu
new file mode 100644
index 0000000..2f7de7b
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricMaxPooling.cu
@@ -0,0 +1,10 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+
+#include <cfloat>
+
+#include "generic/VolumetricMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu
new file mode 100644
index 0000000..eac3b2d
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu
@@ -0,0 +1,57 @@
+#include "THCUNN.h"
+#include "THCTensor.hpp"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include <cfloat>
+
+template <typename Dtype>
+__global__ void cuda_VolumetricMaxUnpooling_updateOutput(
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  Dtype* outputData,
+  int oT, int oH, int oW,
+  int dT, int dH, int dW,
+  int padT, int padH, int padW, int offsetZ)
+{
+  int64_t iColumn = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t iRow = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t iFrame = (blockIdx.z + offsetZ) % input.getSize(1); // input frame/time
+  int64_t slice = (blockIdx.z + offsetZ) / input.getSize(1); // input slice/feature
+
+  if (iRow < input.getSize(2) && iColumn < input.getSize(3))
+  {
+    Dtype val = input[slice][iFrame][iRow][iColumn];
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    outputData[slice*oT*oH*oW + index] = val;
+  }
+}
+
+template <typename Dtype>
+__global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
+  Dtype* gradOutputData,
+  int oT, int oH, int oW,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> gradInput,
+  int dT, int dH, int dW,
+  int padT, int padH, int padW, int offsetZ)
+{
+  int iColumn = blockIdx.x * blockDim.x + threadIdx.x;
+  int iRow = blockIdx.y * blockDim.y + threadIdx.y;
+  int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // output frame/time
+  int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // output slice/feature
+
+  if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
+  {
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    Dtype grad_val = gradOutputData[slice*oT*oH*oW + index];
+    gradInput[slice][iFrame][iRow][iColumn] = grad_val;
+  }
+}
+
+#include "generic/VolumetricMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu
new file mode 100644
index 0000000..27ea3ec
--- /dev/null
+++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu
@@ -0,0 +1,90 @@
+#include "THCUNN.h"
+#include "THCTensor.hpp"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
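+// The padding kernels below replicate border values: each output coordinate is
+// clamped back to the nearest valid input coordinate. Illustrative sketch only
+// (sizes are made up, and IW stands for input.getSize(4)): for one axis with
+// non-negative padding the index math reduces to
+//   inputPointX = min(max(pleft, outputPointX), IW + pleft - 1) - pleft
+// so with IW = 4 and pleft = pright = 2:
+//   outputPointX: 0 1 2 3 4 5 6 7
+//   inputPointX:  0 0 0 1 2 3 3 3   (edge values are replicated)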
+#include "THCAtomics.cuh" +#include + +template +__global__ void VolumetricReplicationPadding_updateOutput( + THCDeviceTensor input, + THCDeviceTensor output, + int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { + + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + if (outputPointId >= (output.getSize(2) * output.getSize(3) * + output.getSize(4))) { + return; + } + int outputPointX = outputPointId % output.getSize(4); + int outputPointY = (outputPointId / output.getSize(4)) % output.getSize(3); + int outputPointZ = outputPointId / (output.getSize(3) * output.getSize(4)); + + int iStartX = max(0, -pleft); + int iStartY = max(0, -ptop); + int iStartZ = max(0, -pfront); + int oStartX = max(0, pleft); + int oStartY = max(0, ptop); + int oStartZ = max(0, pfront); + + int inputPointX = min(max(pleft, outputPointX), + input.getSize(4) + pleft - 1) - oStartX + iStartX; + int inputPointY = min(max(ptop, outputPointY), + input.getSize(3) + ptop - 1) - oStartY + iStartY; + int inputPointZ = min(max(pfront, outputPointZ), + input.getSize(2) + pfront - 1) - oStartZ + iStartZ; + + Dtype valueToCopy = + input[batch][plane][inputPointZ][inputPointY][inputPointX]; + output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy; +} + +template +__global__ void VolumetricReplicationPadding_updateGradInput( + THCDeviceTensor gradInput, + THCDeviceTensor gradOutput, + int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { + int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + int plane = blockIdx.y; + int batch = blockIdx.z; + + if (outputPointId >= (gradOutput.getSize(2) * gradOutput.getSize(3) * + gradOutput.getSize(4))) { + return; + } + int outputPointX = outputPointId % gradOutput.getSize(4); + int outputPointY = (outputPointId / gradOutput.getSize(4)) % + gradOutput.getSize(3); + int outputPointZ = outputPointId / (gradOutput.getSize(3) * + gradOutput.getSize(4)); + + int iStartX = max(0, -pleft); + int iStartY = max(0, -ptop); + int iStartZ = max(0, -pfront); + int oStartX = max(0, pleft); + int oStartY = max(0, ptop); + int oStartZ = max(0, pfront); + + int inputPointX = min(max(pleft, outputPointX), + gradInput.getSize(4) + pleft - 1) - oStartX + iStartX; + int inputPointY = min(max(ptop, outputPointY), + gradInput.getSize(3) + ptop - 1) - oStartY + iStartY; + int inputPointZ = min(max(pfront, outputPointZ), + gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ; + + Dtype valueToCopy = + gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX]; + atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX], + valueToCopy); +} + + +#include "generic/VolumetricReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu new file mode 100644 index 0000000..babbd58 --- /dev/null +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -0,0 +1,114 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCTensor.hpp" + +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" + +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__global__ void nearest_neighbor_5d_kernel( + const int n, + const THCDeviceTensor data1, + THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + 
const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + const float depth_scale = (float) depth1 / (float) depth2; + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int d2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][d1][h1][w1]; + data2[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][d1][h1][w1]; + data2[n][c][d2][h2][w2] = val; + } + } + } +} + +// Backward operation +template +__global__ void nearest_neighbor_5d_kernel_backward( + const int n, + THCDeviceTensor data1, + const THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + const float depth_scale = (float) depth1 / (float) depth2; + const float height_scale = (float) height1 / (float) height2; + const float width_scale = (float) width1 / (float) width2; + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int d2 = index / (height2*width2); // 0:depth2-1 + + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][d1][h1][w1]; + data1[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][d2][h2][w2]; + atomicAdd(data1[n][c][d1][h1][w1].data(), val); + } + } + } +} + + +#include "generic/VolumetricUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu new file mode 100644 index 0000000..0f353b9 --- /dev/null +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -0,0 +1,159 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// 
Originally developed by George Papandreou +#include "THCUNN.h" +#include "THCTensor.hpp" +#include "common.h" +#include "linear_upsampling.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template +__launch_bounds__(1024) +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, + const THCDeviceTensor data1, THCDeviceTensor data2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int t2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[n][c][t1][h1][w1]; + data2[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const Acctype t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const Acctype t1lambda = t1r - t1; + const Acctype t0lambda = Acctype(1) - t1lambda; + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + // + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1] + + w1lambda * data1[n][c][t1][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][t1][h1+h1p][w1] + + w1lambda * data1[n][c][t1][h1+h1p][w1+w1p])) + + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1] + + w1lambda * data1[n][c][t1+t1p][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][t1+t1p][h1+h1p][w1] + + w1lambda * data1[n][c][t1+t1p][h1+h1p][w1+w1p])); + data2[n][c][t2][h2][w2] = ScalarConvert::to(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +__launch_bounds__(1024) +__global__ void caffe_gpu_interp2_kernel_backward(const int n, + const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, + THCDeviceTensor data1, const THCDeviceTensor data2){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int batchsize = data1.getSize(0); + const int channels = data1.getSize(1); + const int depth1 = data1.getSize(2); + const int height1 = data1.getSize(3); + const int width1 = data1.getSize(4); + const int depth2 = data2.getSize(2); + const int height2 = data2.getSize(3); + const int width2 = data2.getSize(4); + if (index < n) { + const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 + const int t2 = index / (height2*width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize ; n++){ + for (int c = 0; c < channels; ++c) { + const Dtype val = data2[n][c][t1][h1][w1]; + data1[n][c][t2][h2][w2] += val; + } + } + return; + } + // + const Acctype t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const Acctype t1lambda = t1r - t1; + const Acctype t0lambda = Acctype(1) - t1lambda; + // + const Acctype h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0;
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
+    //
+    for (int n = 0; n < batchsize ; n++){
+      for (int c = 0; c < channels; ++c) {
+        const Dtype d2val = data2[n][c][t2][h2][w2];
+        atomicAdd(data1[n][c][t1][h1][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h0lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h0lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1+h1p][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h1lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1][h1+h1p][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t0lambda * h1lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h0lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h0lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h1lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(t1lambda * h1lambda * w1lambda * d2val));
+      }
+    }
+  }
+  /////////////////////////////////////////////////////////
+}
+
+
+#include "generic/VolumetricUpSamplingTrilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
new file mode 100644
index 0000000..5149003
--- /dev/null
+++ b/aten/src/THCUNN/common.h
@@ -0,0 +1,85 @@
+#ifndef THCUNN_COMMON_H
+#define THCUNN_COMMON_H
+
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
+
+#define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
+  "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+const int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks for threads.
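+// GET_BLOCKS below is a ceiling division: it launches enough blocks of
+// CUDA_NUM_THREADS threads to cover all N elements. For example, N = 1500 with
+// 1024 threads per block gives (1500 + 1023) / 1024 = 2 blocks; the surplus
+// threads simply fail the `i < n` test in CUDA_KERNEL_LOOP (or the equivalent
+// `index < n` guard) and do no work.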
+inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define THCUNN_resizeAs_indices(STATE, I1, I2) \ + THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ + if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ + { \ + THCudaLongTensor_resize(STATE, I1, size2, NULL); \ + } \ + THLongStorage_free(size2); + +#define THCUNN_check_shape(STATE, I1, I2) \ + if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \ + { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } + + +#define THCUNN_check_shape_indices(STATE, I1, I2) \ + THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ + if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ + { \ + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } \ + THLongStorage_free(size2); + +#define THCUNN_check_nElement(STATE, I1, I2) \ + if (I1 != NULL && I2 != NULL ) { \ + ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \ + ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \ + if (n1 != n2) \ + { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \ + THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \ + THError(#I1 " and " #I2 " have different number of elements: " \ + #I1 "%s has %ld elements, while " \ + #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ + } \ + } + +#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ + if (THCTensor_(nDimension)(STATE, T) != DIM || \ + THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ + if (THCIndexTensor_(nDimension)(STATE, T) != DIM || \ + THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \ + if (!(COND)) { \ + THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ + THArgCheck(COND, ARG, FORMAT, s1.str); \ + } + +#endif diff --git a/aten/src/THCUNN/generic/Abs.cu b/aten/src/THCUNN/generic/Abs.cu new file mode 100644 index 0000000..0b2a5e7 --- /dev/null +++ b/aten/src/THCUNN/generic/Abs.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Abs.cu" +#else + +#include "../common.h" + +void THNN_(Abs_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, absupdateOutput_functor()); +} + +void THNN_(Abs_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor()); +} + +#endif diff --git 
a/aten/src/THCUNN/generic/AbsCriterion.cu b/aten/src/THCUNN/generic/AbsCriterion.cu new file mode 100644 index 0000000..d1faeaa --- /dev/null +++ b/aten/src/THCUNN/generic/AbsCriterion.cu @@ -0,0 +1,82 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/AbsCriterion.cu" +#else + +void THNN_(AbsCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + abs_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus(), abs_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(AbsCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradOutput, gradInput); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + abs_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to(reduction == Reduction::ElementwiseMean ? 
1./size : 1.); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + abs_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/BCECriterion.cu b/aten/src/THCUNN/generic/BCECriterion.cu new file mode 100644 index 0000000..3dcde62 --- /dev/null +++ b/aten/src/THCUNN/generic/BCECriterion.cu @@ -0,0 +1,130 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BCECriterion.cu" +#else + +void THNN_(BCECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights) +{ + THCUNN_check_nElement(state, input, target); + THCUNN_check_nElement(state, input, weights); + THCUNN_assertSameGPU(state, 3, input, target, weights); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + bce_updateOutput_no_reduce_functor()); + if (weights) { + THCTensor_(cmul)(state, output, output, weights); + } + return; + } + + THCTensor_(resize1d)(state, output, 1); + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + + accreal sum; + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); + sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + bce_functor_weights(), + (accreal) 0, + thrust::plus() + ); + THCTensor_(free)(state, weights); + } else { + sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + bce_functor(), + (accreal) 0, + thrust::plus() + ); + } + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights) +{ + THCUNN_check_nElement(state, input, target); + THCUNN_check_nElement(state, input, weights); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_nElement(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + bce_updateGradInput_no_reduce_functor()); + 
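+    // For binary cross entropy, o = -(t * log(x) + (1 - t) * log(1 - x)), so
+    // the per-element gradient is do/dx = (x - t) / (x * (1 - x)). The functor
+    // above is expected to produce that value; the cmul below then applies the
+    // chain rule with gradOutput (and, further down, the optional weights).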
THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + if (weights) { + THCTensor_(cmul)(state, gradInput, gradInput, weights); + } + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to((reduction == Reduction::ElementwiseMean ? accreal(1)/size : accreal(1)) * THCTensor_(get1d)(state, gradOutput, 0)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + gradInput_data, + bce_updateGradInput_functor_weights(norm) + ); + THCTensor_(free)(state, weights); + } else { + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + gradInput_data, + bce_updateGradInput_functor(norm) + ); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu new file mode 100644 index 0000000..1eb3b82 --- /dev/null +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -0,0 +1,108 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BatchNormalization.cu" +#else + +#define DeviceTensor3 THCDeviceTensor +#define DeviceTensor1 THCDeviceTensor + +template +static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor *t) { + if (!t) { + return THCDeviceTensor(); + } + + int inDim = THCTensor__nDimension(state, t); + if (inDim == Dim) { + return toDeviceTensor(state, t); + } + + // View in which the last dimensions are collapsed or expanded as needed + THAssert(THCTensor_isContiguous(state, t)); + int size[Dim]; + for (int i = 0; i < Dim || i < inDim; ++i) { + if (i < Dim && i < inDim) { + size[i] = t->size[i]; + } else if (i < Dim) { + size[i] = 1; + } else { + size[Dim - 1] *= t->size[i]; + } + } + return THCDeviceTensor(t->data(), size); +} + +void THNN_(BatchNormalization_updateOutput)( + THCState *state, THCTensor *input_, THCTensor *output_, + THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, + THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, + bool train, double momentum, double eps) { + + THCTensor_(resizeAs)(state, output_, input_); + if (train) { + int64_t nInput = THCTensor_(size)(state, input_, 1); + THCTensor_(resize1d)(state, saveMean_, nInput); + THCTensor_(resize1d)(state, saveStd_, nInput); + } + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 output = THNN_(devicetensor)<3>(state, output_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 bias = THNN_(devicetensor)<1>(state, bias_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = 
THNN_(devicetensor)<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); + + if (!train) { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutputInference_kernel <<>>( + input, output, runningMean, runningVar, weight, bias, eps); + } else { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutput_kernel <<>>( + input, output, weight, bias, eps, momentum, runningMean, runningVar, + saveMean, saveStd); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(BatchNormalization_backward)( + THCState *state, THCTensor *input_, THCTensor *gradOutput_, + THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_, + THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, + THCTensor *saveMean_, THCTensor *saveStd_, bool train, double scale, double eps) { + + THCUNN_check_shape(state, input_, gradOutput_); + if (gradInput_) { + THCTensor_(resizeAs)(state, gradInput_, input_); + } + + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 gradOutput = THNN_(devicetensor)<3>(state, gradOutput_); + DeviceTensor3 gradInput = THNN_(devicetensor)<3>(state, gradInput_); + DeviceTensor1 gradWeight = THNN_(devicetensor)<1>(state, gradWeight_); + DeviceTensor1 gradBias = THNN_(devicetensor)<1>(state, gradBias_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + + dim3 blocks(gradOutput.getSize(1)); + dim3 threads(getNumThreads(gradOutput.getSize(2))); + BatchNormalizationBackward_kernel <<>>( + input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, + saveMean, saveStd, train, scale, eps); + THCudaCheck(cudaGetLastError()); +} + +#undef DeviceTensor3 +#undef DeviceTensor1 + +#endif diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu new file mode 100644 index 0000000..9508cc8 --- /dev/null +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -0,0 +1,235 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ClassNLLCriterion.cu" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (weights) { + THCUNN_assertSameGPU( + state, 5, input, target, weights, output, total_weight + ); + } else { + THCUNN_assertSameGPU( + state, 4, input, target, output, total_weight + ); + } + + THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); + + int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); + THArgCheck(batch_size == num_targets, + 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", + batch_size, num_targets); + + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THCDescBuff s1 = THCTensor_(sizeDesc)(state, weights); + THError("weight tensor should be defined either for all %d classes or no classes" + " but got weight tensor of shape: %s", n_classes, s1.str); + } + + if (reduction == Reduction::None && n_dims == 2) { + THCTensor_(resize1d)(state, output, batch_size); + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + ClassNLLCriterion_updateOutput_no_reduce_kernel + <<>>( + batch_size, + toDeviceTensor(state, input), + toDeviceTensor(state, target), + toDeviceTensor(state, output), + weights ? THCTensor_(data)(state, weights) : NULL, + n_classes, + ignore_index); + + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + THCTensor_(resize1d)(state, output, 1); + THCTensor_(resize1d)(state, total_weight, 1); + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateOutput_kernel1 + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + n_classes, + ignore_index + ); + + } else if (THCTensor_(nDimension)(state, input) == 2) { + cunn_ClassNLLCriterion_updateOutput_kernel + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes, + ignore_index + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, "gradInput must be contiguous"); + + if (weights) { + THCUNN_assertSameGPU( + state, 5, weights, input, target, gradInput, total_weight + ); + } + else { + THCUNN_assertSameGPU( + state, 4, input, target, gradInput, total_weight + ); + } + + THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); + + int64_t batch_size = n_dims == 
1 ? 1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); + THArgCheck(batch_size == num_targets, + 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", + batch_size, num_targets); + + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + if (reduction == Reduction::None && n_dims == 2) { + THCUNN_check_dim_size(state, gradOutput, 1, 0, batch_size); + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + ClassNLLCriterion_updateGradInput_no_reduce_kernel + <<>>( + batch_size, + toDeviceTensor(state, target), + toDeviceTensor(state, gradOutput), + toDeviceTensor(state, gradInput), + weights ? THCTensor_(data)(state, weights) : NULL, + n_classes, + ignore_index); + + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + ignore_index -= TH_INDEX_BASE; + + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + real *gradInput_data = THCTensor_(data)(state, gradInput); + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateGradInput_kernel1 + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + gradOutput_data, + weights_data, + target_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + n_classes, + ignore_index + ); + } else { + cunn_ClassNLLCriterion_updateGradInput_kernel + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + gradOutput_data, + target_data, + weights_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes, + ignore_index + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/Col2Im.cu b/aten/src/THCUNN/generic/Col2Im.cu new file mode 100644 index 0000000..c0a074c --- /dev/null +++ b/aten/src/THCUNN/generic/Col2Im.cu @@ -0,0 +1,129 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Col2Im.cu" +#else + +static inline void THNN_(Col2Im_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 6, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(sW > 0 && sH > 0, 12, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + THArgCheck(dW > 0 && dH > 0, 8, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int64_t ndim = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 2, input, + "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); + + int batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size[batch_dim + 1]; + + if (nInputPlane % (kW * kH) != 0) { + THError("Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=%lld and " + "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); + } + + int64_t inputLength = input->size[batch_dim + 2]; + int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; + int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; + + if (inputLength != (nBlocksH * nBlocksW)) { + THError("Given output_size=(%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected " + "size of input's dimension 2 to match the calculated number of " + "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.", + outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW, + (long long) nBlocksH, (long long) nBlocksW, + (long long) (nBlocksH * nBlocksW), (long long) inputLength); + } + + if (outputWidth < 1 || outputHeight < 1) { + THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).", + outputHeight, outputWidth); + } +} + +void THNN_(Col2Im_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THCUNN_assertSameGPU(state, 2, input, output); + + THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, sH, sW); + + bool batched_input = true; + if (input->dim() == 2) { + // Force batch + batched_input = false; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + } + + int64_t batchSize = input->size[0]; + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = nInputPlane / (kW * kH); + + input = THCTensor_(newContiguous)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + THCTensor_(zero)(state, output); + + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + for (int64_t elt = 0; elt < batchSize; elt++) { + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nOutputPlane, + outputHeight, outputWidth, + height_col, width_col, + kH, kW, + padH, padW, + sH, sW, + dH, dW, THCTensor_(data)(state, output_n)); + } + + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + if (!batched_input) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + } + THCTensor_(free)(state, input); +} + +void THNN_(Col2Im_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, + kH, kW, dH, dW, padH, padW, sH, sW); + +} + +#endif diff --git a/aten/src/THCUNN/generic/DistKLDivCriterion.cu b/aten/src/THCUNN/generic/DistKLDivCriterion.cu new file mode 100644 index 0000000..e798285 --- /dev/null +++ b/aten/src/THCUNN/generic/DistKLDivCriterion.cu @@ -0,0 +1,89 @@ +#ifndef THC_GENERIC_FILE 
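+// DistKLDivCriterion follows the usual KL-divergence convention in which the
+// input x holds log-probabilities: per element, loss(x, t) = t * (log(t) - x).
+// Quick numeric check (illustrative values only): t = 0.5, x = log(0.25)
+// gives 0.5 * (log(0.5) - log(0.25)) = 0.5 * log(2) ~= 0.3466.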
+#define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 2, input, target); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + kl_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + accreal sum; + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), kl_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, target, gradOutput, gradInput, + kl_updateGradInput_no_reduce_functor()); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = (reduction == Reduction::ElementwiseMean ? 
ScalarConvert::to(accreal(1)/size) : ScalarConvert::to(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + kl_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu new file mode 100644 index 0000000..5c09a06 --- /dev/null +++ b/aten/src/THCUNN/generic/ELU.cu @@ -0,0 +1,50 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ELU.cu" +#else + +#include "../common.h" + + +void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal alpha, + accreal scale, + bool inplace) +{ + real negcoef = ScalarConvert::to(alpha * scale); + real poscoef = ScalarConvert::to(scale); + THCUNN_assertSameGPU(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); + } +} + + +void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal alpha, + accreal scale) +{ + real negcoef = ScalarConvert::to(alpha * scale); + real poscoef = ScalarConvert::to(scale); + THCUNN_check_nElement(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); +} + +#endif diff --git a/aten/src/THCUNN/generic/FeatureLPPooling.cu b/aten/src/THCUNN/generic/FeatureLPPooling.cu new file mode 100644 index 0000000..3f95bcd --- /dev/null +++ b/aten/src/THCUNN/generic/FeatureLPPooling.cu @@ -0,0 +1,267 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/FeatureLPPooling.cu" +#else + +#include "../common.h" + +// non-batch mode: +// [feature dim] +// [feature dim][opt dim 1] +// [feature dim][opt dim 1][opt dim 2] +// +// batch mode: +// [batch dim][feature dim] +// [batch dim][feature dim][opt dim 1] +// [batch dim][feature dim][opt dim 1][opt dim 2] +THCDeviceTensor +THNN_(FeatureLPPooling_upcast)(THCState* state, THCTensor* t, bool batchMode) { + int inputDim = THCTensor_(_nDimension)(state, t); + + if (inputDim == 1) { + // [feature dim] + return toDeviceTensor(state, t). + upcastOuter<2>().upcastInner<4>(); + } else if (inputDim == 2) { + if (batchMode) { + // [batch dim][feature dim] + return toDeviceTensor(state, t). + upcastInner<4>(); + } else { + // [feature dim][opt dim 1] + return toDeviceTensor(state, t). + upcastOuter<3>().upcastInner<4>(); + } + } else if (inputDim == 3) { + if (batchMode) { + // [batch dim][feature dim][opt dim 1] + return toDeviceTensor(state, t). + upcastInner<4>(); + } else { + // [feature dim][opt dim 1][opt dim 2] + return toDeviceTensor(state, t). 
+ upcastOuter<4>(); + } + } else { + // inputDim == 4 + // [batch dim][feature dim][opt dim 1][opt dim 2] + THAssert(batchMode); + return toDeviceTensor(state, t); + } +} + +// Resizes `toResize` based on the output size for `src` as an input +// tensor +void +THNN_(FeatureLPPooling_resizeForOutput)(THCState* state, + THCTensor* toResize, + THCTensor* input, + bool batchMode, + int width, + int stride) { + int inputDim = THCTensor_(_nDimension)(state, input); + THAssert(inputDim >= 1 && inputDim <= 4); + + int64_t outSize = + lpPoolingOutputSize(THCTensor_(size)(state, input, 0), width, stride); + if (batchMode) { + THAssert(inputDim > 1); + outSize = + lpPoolingOutputSize(THCTensor_(size)(state, input, 1), width, stride); + } else { + THAssert(inputDim < 4); + } + + if (inputDim == 1) { + THCTensor_(resize1d)(state, toResize, outSize); + } else if (inputDim == 2) { + if (batchMode) { + THCTensor_(resize2d)( + state, toResize, THCTensor_(size)(state, input, 0), outSize); + } else { + THCTensor_(resize2d)( + state, toResize, outSize, THCTensor_(size)(state, input, 1)); + } + } else if (inputDim == 3) { + if (batchMode) { + THCTensor_(resize3d)( + state, + toResize, + THCTensor_(size)(state, input, 0), outSize, + THCTensor_(size)(state, input, 2)); + } else { + THCTensor_(resize3d)( + state, + toResize, + outSize, THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2)); + } + } else if (inputDim == 4) { + THCTensor_(resize4d)( + state, + toResize, + THCTensor_(size)(state, input, 0), outSize, + THCTensor_(size)(state, input, 2), THCTensor_(size)(state, input, 3)); + } +} + +// Makes `toResize` the same size/dimensionality as `src` +void +THNN_(FeatureLPPooling_resize)(THCState* state, + THCTensor* toResize, + THCTensor* src) { + int inputDim = THCTensor_(_nDimension)(state, src); + THAssert(inputDim >= 1 && inputDim <= 4); + + if (inputDim == 1) { + THCTensor_(resize1d)(state, + toResize, + THCTensor_(size)(state, src, 0)); + } else if (inputDim == 2) { + THCTensor_(resize2d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1)); + } else if (inputDim == 3) { + THCTensor_(resize3d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1), + THCTensor_(size)(state, src, 2)); + } else if (inputDim == 4) { + THCTensor_(resize4d)( + state, + toResize, + THCTensor_(size)(state, src, 0), + THCTensor_(size)(state, src, 1), + THCTensor_(size)(state, src, 2), + THCTensor_(size)(state, src, 3)); + } +} + +void THNN_(FeatureLPPooling_updateOutput)(THCState* state, + THCTensor* inputTH, + THCTensor* outputTH, + accreal power, + int width, + int stride, + bool batchMode) { + THCUNN_assertSameGPU(state, 2, inputTH, outputTH); + + int inputDim = THCTensor_(_nDimension)(state, inputTH); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 2, + "input tensor must fit into 32-bit index math"); + + THCDeviceTensor input; + THCDeviceTensor output; + + input = THNN_(FeatureLPPooling_upcast)(state, inputTH, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(input.getSize(1) >= width, 2, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 5, + "width must be between 
2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 6, + "stride must be between 1 - 4"); + + THNN_(FeatureLPPooling_resizeForOutput)( + state, outputTH, inputTH, batchMode, width, stride); + + output = THNN_(FeatureLPPooling_upcast)(state, outputTH, batchMode); + + bool found = runFeatureLPPoolingUpdateOutput(state, + input, + output, + power, + width, + stride); + THAssert(found); +} + +void THNN_(FeatureLPPooling_updateGradInput)(THCState* state, + THCTensor* gradOutputTH, + THCTensor* inputTH, + THCTensor* outputTH, + THCTensor* gradInputTH, + accreal power, + int width, + int stride, + bool batchMode) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutputTH), 2, + "output gradient tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 3, + "input tensor must fit into 32-bit index math"); + THCUNN_assertSameGPU(state, 4, gradOutputTH, inputTH, outputTH, gradInputTH); + + int inputDim = THCTensor_(_nDimension)(state, inputTH); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + THCDeviceTensor gradOutput; + THCDeviceTensor input; + THCDeviceTensor output; + THCDeviceTensor gradInput; + + input = THNN_(FeatureLPPooling_upcast)(state, inputTH, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(input.getSize(1) >= width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 7, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 8, + "stride must be between 1 - 4"); + + gradOutput = THNN_(FeatureLPPooling_upcast)(state, gradOutputTH, batchMode); + output = THNN_(FeatureLPPooling_upcast)(state, outputTH, batchMode); + + for (int i = 0; i < 4; ++i) { + THAssertMsg(output.getSize(i) == gradOutput.getSize(i), + "output and gradOutput sizes do not match"); + } + + // Make sure that the input sizes produce the output sizes + THArgCheck(lpPoolingOutputSize(input.getSize(1), width, stride) == + output.getSize(1), 3, + "input and output sizes do not match with respect to " + "width and stride"); + + // Resize `gradInput` based on `input` + THNN_(FeatureLPPooling_resize)(state, gradInputTH, inputTH); + gradInput = THNN_(FeatureLPPooling_upcast)(state, gradInputTH, batchMode); + + bool found = runFeatureLPPoolingUpdateGradInput(state, + gradOutput, + input, + output, + gradInput, + power, + width, + stride); + THAssert(found); +} + +#endif diff --git a/aten/src/THCUNN/generic/FusedRNNKernel.cu b/aten/src/THCUNN/generic/FusedRNNKernel.cu new file mode 100644 index 0000000..1d16bc4 --- /dev/null +++ b/aten/src/THCUNN/generic/FusedRNNKernel.cu @@ -0,0 +1,785 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/FusedRNNKernel.cu" +#else +#include + +#include "../common.h" + +#define TINFO TensorInfo + +//factor will be 3 for GRU and 4 for LSTM +void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) 
+{ + va_list list; + va_start(list, count); + THCTensor *input = va_arg(list, THCTensor*); + THCTensor *hidden = va_arg(list, THCTensor*); + THArgCheck(THCTensor_(nElement)(state, input) == + THCTensor_(nElement)(state, hidden), + 3, "Input and Hidden tensor sizes should be the same."); + + THAssertMsg(THCTensor__nDimension(state, input) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + + THAssertMsg(THCTensor__nDimension(state, hidden) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + + for (int arg=2; arg < count; ++arg){ + THCTensor *tens = va_arg(list, THCTensor*); + THArgCheck(THCTensor_(nElement)(state, input) == + THCTensor_(nElement)(state, tens)*factor, + 3, "A pointwise tensor was not the right size, should have 1/%u the elements of input/hidden tensor.", arg, factor); + THAssertMsg(THCTensor__nDimension(state, tens) <= MAX_CUTORCH_DIMS, + "Tensor dimension is too large."); + } + + va_end(list); +} + +int THNN_(minIndexType)(THCState *state, int count, ...) +{ + va_list list; + va_start(list, count); + + THCTensor* tens = va_arg(list, THCTensor*); + int startDim = THCTensor__nDimension(state, tens); + bool canCollapse = THCTensor_(isContiguous)(state,tens); + + for (int arg=1; arg < count; ++arg){ + tens = va_arg(list, THCTensor*); + canCollapse = canCollapse && THCTensor_(isContiguous)(state, tens); + if(THCTensor__nDimension(state, tens) != startDim){ + va_end(list); + return -1; + } + } + va_end(list); + if(canCollapse) return -2; + return startDim; +} + +bool THNN_(canUse32BitIndexMath)(THCState *state, int count, ...) +{ + va_list list; + va_start(list, count); + + for (int arg=0; arg < count; ++arg){ + THCTensor *tens = va_arg(list, THCTensor*); + if (!THCTensor_canUse32BitIndexMath(state, tens)){ + va_end(list); + return false; + } + } + va_end(list); + return true; +} + +#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +#define H2F(input) ScalarConvert::to(input) +#define F2H(input) ScalarConvert::to(input) + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +THNN_(GRUForward)(TensorInfo Input, + TensorInfo Hidden, + TensorInfo Bias1, + TensorInfo Bias2, + TensorInfo _hx, + TensorInfo _hy, + TensorInfo storage, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) + { + + IndexType offset = (linearIndex/hsz)*3*hsz+linearIndex%hsz; + + T ir = DEVICE_LINEAR_GET(Input, offset+0*hsz); + T ii = DEVICE_LINEAR_GET(Input, offset+1*hsz); + T in = DEVICE_LINEAR_GET(Input, offset+2*hsz); + T hr = DEVICE_LINEAR_GET(Hidden,offset+0*hsz); + T hi = DEVICE_LINEAR_GET(Hidden,offset+1*hsz); + T hn = DEVICE_LINEAR_GET(Hidden, offset+2*hsz); + + T hx = DEVICE_LINEAR_GET(_hx, linearIndex); + T* hy = &DEVICE_LINEAR_GET(_hy, linearIndex); + + bool has_bias = (Bias1.data != NULL); + + T b1r, b1i, b1n, b2r, b2i, b2n; + + if(has_bias){ + b1r = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+0*hsz); + b1i = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+1*hsz); + b1n = DEVICE_LINEAR_GET(Bias1, linearIndex%hsz+2*hsz); + + b2r = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+0*hsz); + b2i = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+1*hsz); + b2n = DEVICE_LINEAR_GET(Bias2, linearIndex%hsz+2*hsz); + }else{ +#ifndef THC_REAL_IS_HALF + b1r = 0.0; b1i = 0.0; b1n = 0.0; + b2r = 0.0; b2i = 0.0; b2n = 0.0; +#else + b1r = F2H(0.0); b1i = F2H(0.0); b1n = F2H(0.0); + b2r = F2H(0.0); b2i 
= F2H(0.0); b2n = F2H(0.0); +#endif + } + + + offset = (linearIndex/hsz)*5*hsz+linearIndex%hsz; + + accreal rg, ig, ng; + + rg = H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r); + ig = H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i); + + TensorSigmoidOp()(&rg, &rg); + TensorSigmoidOp()(&ig, &ig); + ng = H2F(in) + H2F(b1n) + rg*( H2F(hn)+H2F(b2n) ); + ng = THCNumerics::tanh(ng); + *hy = F2H( ng + ig * ( H2F(hx)-ng ) ); + + //SAVE FOR BACKWARDS + DEVICE_LINEAR_GET(storage, offset+0*hsz) = F2H(rg); + DEVICE_LINEAR_GET(storage, offset+1*hsz) = F2H(ig); + DEVICE_LINEAR_GET(storage, offset+2*hsz) = F2H(ng); + DEVICE_LINEAR_GET(storage, offset+3*hsz) = hx; + DEVICE_LINEAR_GET(storage, offset+4*hsz) = F2H(H2F(hn) + H2F(b2n)); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void +THNN_(GRUBackward)(TensorInfo gradInInput, + TensorInfo gradInHidden, + TensorInfo gradOutput, + TensorInfo gradInputHx, + TensorInfo storage, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + IndexType offset = (linearIndex/hsz)*5*hsz+linearIndex%hsz; + + T rg = DEVICE_LINEAR_GET(storage, offset+0*hsz); + T ig = DEVICE_LINEAR_GET(storage, offset+1*hsz); + T ng = DEVICE_LINEAR_GET(storage, offset+2*hsz); + T hx = DEVICE_LINEAR_GET(storage, offset+3*hsz); + T hn = DEVICE_LINEAR_GET(storage, offset+4*hsz); + + T go = DEVICE_LINEAR_GET(gradOutput, linearIndex); + + offset = (linearIndex/hsz)*3*hsz+linearIndex%hsz; + + accreal gig = H2F(go)*( H2F(hx)-H2F(ng) )*( 1-H2F(ig) )*H2F(ig); + accreal ghx = H2F(go)*H2F(ig); + accreal gin = H2F(go)*( 1-H2F(ig) )*( 1-H2F(ng)*H2F(ng) ); + accreal ghn = gin * H2F(rg); + accreal grg = gin *H2F(hn)*( 1-H2F(rg) )*H2F(rg); + + DEVICE_LINEAR_GET(gradInInput, offset+0*hsz) = F2H(grg); + DEVICE_LINEAR_GET(gradInInput, offset+1*hsz) = F2H(gig); + DEVICE_LINEAR_GET(gradInInput, offset+2*hsz) = F2H(gin); + + DEVICE_LINEAR_GET(gradInHidden, offset+0*hsz) = F2H(grg); + DEVICE_LINEAR_GET(gradInHidden, offset+1*hsz) = F2H(gig); + DEVICE_LINEAR_GET(gradInHidden, offset+2*hsz) = F2H(ghn); + DEVICE_LINEAR_GET(gradInputHx, linearIndex) = F2H(ghx); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void + THNN_(LSTMForward)(TensorInfo input, + TensorInfo hidden, + TensorInfo bias1, + TensorInfo bias2, + TensorInfo _cx, + TensorInfo _hy, + TensorInfo _cy, + IndexType hsz, + IndexType totalElements) +{ + + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) + { + + IndexType offset = (linearIndex/hsz)*4*hsz+linearIndex%hsz; + + T* iig = &DEVICE_LINEAR_GET(input, offset+0*hsz); + T* ifg = &DEVICE_LINEAR_GET(input, offset+1*hsz); + T* icg = &DEVICE_LINEAR_GET(input, offset+2*hsz); + T* iog = &DEVICE_LINEAR_GET(input, offset+3*hsz); + + T hig = DEVICE_LINEAR_GET(hidden, offset+0*hsz); + T hfg = DEVICE_LINEAR_GET(hidden, offset+1*hsz); + T hcg = DEVICE_LINEAR_GET(hidden, offset+2*hsz); + T hog = DEVICE_LINEAR_GET(hidden, offset+3*hsz); + + T cx = DEVICE_LINEAR_GET(_cx, linearIndex); + + T* hy = &DEVICE_LINEAR_GET(_hy, linearIndex); + T* cy = &DEVICE_LINEAR_GET(_cy, linearIndex); + + bool has_bias = (bias1.data != NULL); + + T b1i, b1f, b1c, b1o; + T b2i, b2f, b2c, b2o; + + if(has_bias){ + b1i = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+0*hsz); + b1f = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+1*hsz); + b1c = 
DEVICE_LINEAR_GET(bias1, linearIndex%hsz+2*hsz); + b1o = DEVICE_LINEAR_GET(bias1, linearIndex%hsz+3*hsz); + + b2i = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+0*hsz); + b2f = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+1*hsz); + b2c = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+2*hsz); + b2o = DEVICE_LINEAR_GET(bias2, linearIndex%hsz+3*hsz); + + }else{ +#ifndef THC_REAL_IS_HALF + b1i = 0.0; b1f = 0.0; b1c = 0.0; b1o = 0.0; + b2i = 0.0; b2f = 0.0; b2c = 0.0; b2o = 0.0; +#else + b1i = F2H(0.0); b1f = F2H(0.0); b1c = F2H(0.0); b1o = F2H(0.0); + b2i = F2H(0.0); b2f = F2H(0.0); b2c = F2H(0.0); b2o = F2H(0.0); +#endif + } + + accreal ig, fg, cg, og; + accreal f_hy, f_cy; + + ig = H2F(*iig) + H2F(hig) + H2F(b1i) + H2F(b2i); + fg = H2F(*ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f); + cg = H2F(*icg) + H2F(hcg) + H2F(b1c) + H2F(b2c); + og = H2F(*iog) + H2F(hog) + H2F(b1o) + H2F(b2o); + + TensorSigmoidOp()(&ig, &ig); + TensorSigmoidOp()(&fg, &fg); + cg = THCNumerics::tanh(cg); + TensorSigmoidOp()(&og, &og); + + f_cy = (fg * H2F(cx) ) + (ig * cg); + f_hy = og * THCNumerics::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + //SAVE FOR BACKWARDS + //Also need cy and cx but can be saved easily in python + *iig = F2H(ig); + *ifg = F2H(fg); + *icg = F2H(cg); + *iog = F2H(og); + + } +} + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(32 * 16, 4) +#endif +__global__ void + THNN_(LSTMBackward)(TensorInfo storage, + TensorInfo gradInGates, + TensorInfo _cx, + TensorInfo _cy, + TensorInfo gradoutput, + TensorInfo gradoutputcell, + TensorInfo gradInputCx, + IndexType hsz, + IndexType totalElements) +{ + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + IndexType offset = (linearIndex/hsz)*4*hsz+linearIndex%hsz; + + T ig = DEVICE_LINEAR_GET(storage, offset+0*hsz); + T fg = DEVICE_LINEAR_GET(storage, offset+1*hsz); + T cg = DEVICE_LINEAR_GET(storage, offset+2*hsz); + T og = DEVICE_LINEAR_GET(storage, offset+3*hsz); + + T* ih = &DEVICE_LINEAR_GET(gradInGates, offset+0*hsz); + T* fh = &DEVICE_LINEAR_GET(gradInGates, offset+1*hsz); + T* ch = &DEVICE_LINEAR_GET(gradInGates, offset+2*hsz); + T* oh = &DEVICE_LINEAR_GET(gradInGates, offset+3*hsz); + + //will return hidden grads here + T cx = DEVICE_LINEAR_GET(_cx, linearIndex); + T cy = DEVICE_LINEAR_GET(_cy, linearIndex); + + T* gi = &DEVICE_LINEAR_GET(gradInputCx, linearIndex); + + T go = DEVICE_LINEAR_GET(gradoutput, linearIndex); + T goc= DEVICE_LINEAR_GET(gradoutputcell, linearIndex); + + accreal gcx = THCNumerics::tanh(H2F(cy)); + + + accreal gog = H2F(go) * gcx; + gcx = H2F(go) * H2F(og) * ( 1 - gcx*gcx) + H2F(goc); + + accreal gig = gcx * H2F(cg); + accreal gfg = gcx * H2F(cx); + accreal gcg = gcx * H2F(ig); + + gcx = gcx * H2F(fg); + + gig = gig * (1-H2F(ig)) * H2F(ig); + gfg = gfg * (1-H2F(fg)) * H2F(fg); + gcg = gcg * (1-H2F(cg)*H2F(cg)); + gog = gog * (1-H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + *gi = F2H(gcx); + + } +} + + +// ************ START Create function calls ********** // +#define FILL_FUNCTION(ITYPE, DIM, FUNCTION) FUNCTION(ITYPE, DIM) + +#define FILL_DIM(ITYPE, DIM, FUNCTION) \ + switch (DIM) { \ + case -2: \ + FILL_FUNCTION(ITYPE, -2, FUNCTION); \ + break; \ + case 1: \ + FILL_FUNCTION(ITYPE, 1, FUNCTION); \ + break; \ + case 2: \ + FILL_FUNCTION(ITYPE, 2, FUNCTION); \ + break; \ + default: \ + FILL_FUNCTION(ITYPE, -1, FUNCTION); \ + break; \ + } + +#define LSTM_FORWARD(ITYPE, DIM) THNN_(LSTMForward) 
\ + \ + <<>> \ + (inputI, hiddenI, \ + bias1I, bias2I, cxI, hyI, cyI, \ + hid_size, totalElements); + +#define LSTM_BACKWARD(ITYPE, DIM) THNN_(LSTMBackward) \ + \ + <<>> \ + (storageI, gradingatesI, cxI, cyI, \ + gradoutI, gradoutcI, gradincxI, \ + hid_size, totalElements); + +#define GRU_FORWARD(ITYPE, DIM) THNN_(GRUForward) \ + <<>> \ + (inputI, hiddenI, bias1I, bias2I, hxI, hyI, storageI, \ + hid_size, totalElements); + +#define GRU_BACKWARD(ITYPE, DIM) THNN_(GRUBackward) \ + \ + <<>> \ + (gradininputI, gradinhiddenI, gradoutI, gradinhxI, storageI, \ + hid_size, totalElements); + +// ************ END Create actual function calls ************ // + +template +void THNN_(LSTM_forw_ind_wrap)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *cx, + THCTensor *hy, + THCTensor *cy) +{ + bool has_bias = (bias1!=NULL); + + int maxDim; + if(has_bias){ + THCUNN_assertSameGPU(state, 7, input, hidden, bias1, bias2, hy, cy, cx); + maxDim = THNN_(minIndexType) + (state, 7, input, hidden, bias1, bias2, hy, cy, cx); + }else{ + THCUNN_assertSameGPU(state, 5, input, hidden, hy, cy, cx); + maxDim = THNN_(minIndexType) + (state, 5, input, hidden, hy, cy, cx); + } + + ptrdiff_t totalElements = THCTensor_nElement(state, cx); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply."); + + TINFO inputI = getTensorInfo(state, input); + TINFO hiddenI = getTensorInfo(state, hidden); + TINFO cxI = getTensorInfo(state, cx); + TINFO hyI = getTensorInfo(state, hy); + TINFO cyI = getTensorInfo(state, cy); + + INDTYPE hid_size = cxI.sizes[cxI.dims-1]; + if(has_bias){ + THAssertMsg( hid_size*4 == static_cast(THCTensor_(nElement)(state, bias1)) && + hid_size*4 == static_cast(THCTensor_(nElement)(state, bias2)), + "Bias in pointwise operation is an incorrect size, must be 4 x feature size."); + } + + if(maxDim == -2){ + inputI.collapseDims(); + hiddenI.collapseDims(); + cxI.collapseDims(); + hyI.collapseDims(); + cyI.collapseDims(); + } + + INDTYPE zero[1] = {0}; + TINFO nullinfo = TINFO(NULL, 1, zero, zero); + TINFO bias1I = nullinfo; + TINFO bias2I = nullinfo; + + if(has_bias){ + bias1I = getTensorInfo(state, bias1); + bias2I = getTensorInfo(state, bias2); + if(maxDim == -2){ + bias1I.collapseDims(); + bias2I.collapseDims(); + } + } + + FILL_DIM(INDTYPE, maxDim, LSTM_FORWARD); + +} +void THNN_(LSTMFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *cx, + THCTensor *hy, + THCTensor *cy) +{ + THCTensor_(resizeAs)(state, hy, cx); + THCTensor_(resizeAs)(state, cy, cx); + THNN_(FusedRNNAssertSizes)(state, 4, 5, input, hidden, hy, cy, cx); + + bool has_bias = (bias1!=NULL); + bool canUse32bi; + if(has_bias){ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, input, hidden, bias1, bias2, hy, cy, cx); + }else{ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 5, input, hidden, hy, cy, cx); + } + + if(canUse32bi){ + THNN_(LSTM_forw_ind_wrap) + (state, input, hidden, bias1, bias2, cx, hy, cy); + }else{ + THNN_(LSTM_forw_ind_wrap) + (state, input, hidden, bias1, bias2, cx, hy, cy); + } + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(LSTM_back_ind_wrap)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *cx, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + 
THCTensor *gradInputCx) +{ + int maxDim = THNN_(minIndexType) + (state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply"); + + TINFO storageI = getTensorInfo(state, storage); + TINFO gradingatesI = getTensorInfo(state, gradInGates); + TINFO cxI = getTensorInfo(state, cx); + TINFO cyI = getTensorInfo(state, cy); + TINFO gradoutI = getTensorInfo(state, gradOutput); + TINFO gradoutcI = getTensorInfo(state, gradOutputCell); + TINFO gradincxI = getTensorInfo(state, gradInputCx); + + INDTYPE hid_size = gradoutI.sizes[gradoutI.dims-1]; + + if(maxDim == -2){ + storageI.collapseDims(); + gradingatesI.collapseDims(); + cxI.collapseDims(); + cyI.collapseDims(); + gradoutI.collapseDims(); + gradoutcI.collapseDims(); + gradincxI.collapseDims(); + } + FILL_DIM(INDTYPE, maxDim, LSTM_BACKWARD); + +} + +void THNN_(LSTMFused_updateGradInput)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *cx, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + THCTensor *gradInputCx) +{ + THCTensor_(resizeAs)(state, gradInputCx, gradOutput); + THCUNN_assertSameGPU(state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + THNN_(FusedRNNAssertSizes) + (state, 4, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + + bool canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + + if(canUse32bi){ + THNN_(LSTM_back_ind_wrap) + (state, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + }else{ + THNN_(LSTM_back_ind_wrap) + (state, storage, gradInGates, cx, cy, + gradOutput, gradOutputCell, gradInputCx); + } + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(GRU_forw_ind_wrap)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *hx, + THCTensor *hy, + THCTensor *storage) +{ + bool has_bias = (bias1!=NULL); + int maxDim; + + if(has_bias){ + THCUNN_assertSameGPU + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + maxDim = THNN_(minIndexType) + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + }else{ + THCUNN_assertSameGPU + (state, 5, input, hidden, hx, hy, storage); + maxDim = THNN_(minIndexType) + (state, 5, input, hidden, hx, hy, storage); + } + + ptrdiff_t totalElements = THCTensor_nElement(state, hx); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply."); + + TINFO inputI = getTensorInfo(state, input); + TINFO hiddenI = getTensorInfo(state, hidden); + TINFO hxI = getTensorInfo(state, hx); + TINFO hyI = getTensorInfo(state, hy); + TINFO storageI = getTensorInfo(state, storage); + + INDTYPE hid_size = hxI.sizes[hxI.dims-1]; + if(has_bias){ + THAssertMsg( hid_size*3 == static_cast(THCTensor_(nElement)(state, bias1)) && + hid_size*3 == static_cast(THCTensor_(nElement)(state, bias2)), + "Bias in pointwise operation is an incorrect size, must be 3 x feature size."); + } + + if(maxDim == -2){ + inputI.collapseDims(); + hiddenI.collapseDims(); + hyI.collapseDims(); + 
hxI.collapseDims(); + storageI.collapseDims(); + } + + INDTYPE zero[1] = {0}; + TINFO nullinfo = TINFO(NULL, 1, zero, zero); + TINFO bias1I = nullinfo; + TINFO bias2I = nullinfo; + + if(has_bias){ + bias1I = getTensorInfo(state, bias1); + bias2I = getTensorInfo(state, bias2); + if(maxDim == -2){ + bias1I.collapseDims(); + bias2I.collapseDims(); + } + } + + FILL_DIM(INDTYPE, maxDim, GRU_FORWARD); + +} + +void THNN_(GRUFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, + THCTensor *bias2, + THCTensor *hx, + THCTensor *hy, + THCTensor *storage) +{ + THCTensor_(resizeAs)(state, hy, hx); + THNN_(FusedRNNAssertSizes)(state, 3, 4, input, hidden, hx, hy); + THArgCheck(THCTensor_(nElement)(state, storage) == + THCTensor_(nElement)(state, hx)*5, + 3, "Storage tensor for fused kernel was not sized correctly."); + + + bool has_bias = (bias1!=NULL); + bool canUse32bi; + + if(has_bias){ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 7, input, hidden, hx, hy, bias1, bias2, storage); + }else{ + canUse32bi = THNN_(canUse32BitIndexMath) + (state, 5, input, hidden, hx, hy, storage); + } + + if(canUse32bi){ + THNN_(GRU_forw_ind_wrap) + (state, input, hidden, bias1, bias2, hx, hy, storage); + }else{ + THNN_(GRU_forw_ind_wrap) + (state, input, hidden, bias1, bias2, hx, hy, storage); + } + + THCudaCheck(cudaGetLastError()); +} + +template +void THNN_(GRU_back_ind_wrap)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage) +{ + + int maxDim = THNN_(minIndexType)(state, 5, gradInInput, gradInHidden, gradOutput, + gradInputHx, storage); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); + + const dim3 block = getApplyBlock(); + dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), + "Could not get grid size for pointwise apply"); + + TINFO gradininputI = getTensorInfo(state, gradInInput); + TINFO gradinhiddenI = getTensorInfo(state, gradInHidden); + TINFO gradoutI = getTensorInfo(state, gradOutput); + TINFO gradinhxI = getTensorInfo(state, gradInputHx); + TINFO storageI = getTensorInfo(state, storage); + + INDTYPE hid_size = gradoutI.sizes[gradoutI.dims-1]; + + if(maxDim == -2){ + gradininputI.collapseDims(); + gradinhiddenI.collapseDims(); + gradoutI.collapseDims(); + gradinhxI.collapseDims(); + storageI.collapseDims(); + } + FILL_DIM(INDTYPE, maxDim, GRU_BACKWARD); +} + +void THNN_(GRUFused_updateGradInput)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage) +{ + THCTensor_(resizeAs)(state, gradInputHx, gradOutput); + THCUNN_assertSameGPU(state, 5, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + THNN_(FusedRNNAssertSizes)(state, 3, 4, gradInInput, gradInHidden, gradOutput, gradInputHx); + bool canUse32bi = THNN_(canUse32BitIndexMath)(state, 5, gradInInput, gradInHidden, + gradOutput, gradInputHx, storage); + if(canUse32bi){ + THNN_(GRU_back_ind_wrap) + (state, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + }else{ + THNN_(GRU_back_ind_wrap) + (state, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); + } + + THCudaCheck(cudaGetLastError()); +} + +//Clean up compiler namespace +#undef DEVICE_LINEAR_GET +#undef H2F +#undef F2H +#undef EXPAND_FUNCTION +#undef EXPAND_DIM +#undef EXPAND_TYPE +#undef FILL_TYPES_FORWARD +#undef FILL_FORWARD +#undef 
FILL_TYPES_BACKWARD +#undef FILL_BACKWARD + +#endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu new file mode 100644 index 0000000..4622403 --- /dev/null +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -0,0 +1,59 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/GatedLinearUnit.cu" +#else + +void THNN_(GatedLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int dim) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + // size output to half of input + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THCTensor_(size)(state, input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; + THLongStorage *newSizes = THCTensor_(newSizeOf)(state, input); + THLongStorage_set(newSizes, dim, inputSize); + THCTensor_(resize)(state, output, newSizes, NULL); + + // halve tensor + THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); + THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize); + + // x = x1:cmul( sigmoid(x2) ) + THC_pointwiseApply3(state, output, secondHalf, firstHalf, gatedLinearCSigMul_functor()); + + THLongStorage_free(newSizes); + THCTensor_(free)(state, firstHalf); + THCTensor_(free)(state, secondHalf); +} + +void THNN_(GatedLinear_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int dim) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THCTensor_(size)(state, input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + THCTensor_(resizeAs)(state, gradInput, input); + const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; + THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); + THCTensor *gradInputfirstHalf = THCTensor_(newNarrow)(state, gradInput, dim, 0, inputSize); + const int64_t stride_i = THCTensor_(stride)(state, input, dim) * inputSize; + const int64_t stride_gI = THCTensor_(stride)(state, gradInput, dim) * inputSize; + THC_pointwiseApply3(state, gradInputfirstHalf, gradOutput, firstHalf, gatedLinearDerivative(stride_i, stride_gI)); + THCTensor_(free)(state, firstHalf); + THCTensor_(free)(state, gradInputfirstHalf); +} + +#endif diff --git a/aten/src/THCUNN/generic/HardTanh.cu b/aten/src/THCUNN/generic/HardTanh.cu new file mode 100644 index 0000000..18195b7 --- /dev/null +++ b/aten/src/THCUNN/generic/HardTanh.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/HardTanh.cu" +#else + +#include "../common.h" + +void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = ScalarConvert::to(min_val_); + real max_val = ScalarConvert::to(max_val_); + + THCUNN_assertSameGPU(state, 2, input, output); + if(inplace) + { + THCTensor_(set)(state, output, input); + THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, + hardtanhupdateOutput_functor(min_val, max_val)); + } +} + +void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = ScalarConvert::to(min_val_); + real max_val = ScalarConvert::to(max_val_); + + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + if (inplace) + { + THCTensor_(set)(state, gradInput, gradOutput); + THC_pointwiseApply2(state, gradInput, input, + hardtanhupdateGradInput_functor(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, + hardtanhupdateGradInput_functor(min_val, max_val)); + } +} + +#endif diff --git a/aten/src/THCUNN/generic/Im2Col.cu b/aten/src/THCUNN/generic/Im2Col.cu new file mode 100644 index 0000000..dd35461 --- /dev/null +++ b/aten/src/THCUNN/generic/Im2Col.cu @@ -0,0 +1,119 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Im2Col.cu" +#else + +static inline void THNN_(Im2Col_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 6, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(padW >= 0 && padH >= 0, 8, + "padding should be non-negative, but got padH: %d padW: %d", padH, padW); + THArgCheck(sW > 0 && sH > 0, 10, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + + int64_t ndim = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "Expected non-empty 3D or 4D input tensor, but got input of shape 
%s"); + + int dim_batch = 0; + if (ndim == 3) { + dim_batch = -1; + } + int64_t nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); + int64_t inputHeight = THCTensor_(size)(state, input, dim_batch + 2); + int64_t inputWidth = THCTensor_(size)(state, input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + if (outputHeight < 1 || outputWidth < 1) { + THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), calculated " + "shape of the array of sliding blocks as (%d, %d), which is " + "too small (non-positive).", + inputHeight, inputHeight, kH, kW, dH, dW, padH, padW, + outputHeight, outputWidth); + } +} + +void THNN_(Im2Col_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THCUNN_assertSameGPU(state, 2, input, output); + + THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); + + input = THCTensor_(newContiguous)(state, input); + bool batched_input = true; + if (input->dim() == 3) { + batched_input = false; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t batchSize = THCTensor_(size)(state, input, 0); + int64_t nInputPlane = THCTensor_(size)(state, input, 1); + int64_t inputHeight = THCTensor_(size)(state, input, 2); + int64_t inputWidth = THCTensor_(size)(state, input, 3); + + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + THCTensor_(resize3d)(state, output, batchSize, nOutputPlane, outputLength); + THCTensor_(zero)(state, output); + + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + for (int64_t elt = 0; elt < batchSize; elt++) { + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, sH, sW, + dH, dW, THCTensor_(data)(state, output_n)); + } + + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + if (!batched_input) { + THCTensor_(resize2d)(state, output, nOutputPlane, outputLength); + } + THCTensor_(free)(state, input); +} + +void THNN_(Im2Col_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, + inputHeight, inputWidth, + kH, kW, dH, dW, + padH, padW, sH, sW); +} + +#endif diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu new file mode 100644 index 0000000..244d234 --- /dev/null +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -0,0 +1,273 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/IndexLinear.cu" +#else + +static bool THNN_(checkKeysValues)(THCState *state, THCudaLongTensor* keys, + THCTensor* values) +{ + return THCudaLongTensor_size(state, keys, 0) == 
THCTensor_(nElement)(state, values) + && THCTensor_(_nDimension)(state, values) == 1 + && THCudaLongTensor__nDimension(state, keys) == 1; +} + +void THNN_(IndexLinear_updateOutput)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *normalizedValues, + int train) +{ + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, output), 6, + "output vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 7, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 8, + "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t wDim = weight->size[1]; + int64_t weightStride = weight->stride[0]; + int maxNormalize = wDim - outDim; + int64_t keysSize = keys->size[0]; + int64_t nnzPerRow = divup(keysSize, batchSize); + + THCTensor_(resize2d)(state, output, batchSize, outDim); + int64_t *keysData = THCudaLongTensor_data (state, keys); + real *valuesData = THCTensor_(data) (state, values); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + real *biasData = THCTensor_(data) (state, bias); + real *weightData = THCTensor_(data) (state, weight); + real *outData = THCTensor_(data) (state, output); + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + int blocks_y = batchSize; + int nnzPerBlock = ((outDim == 1 || batchSize == 1) ? 
THREADS_X : NNZ_PER_BLOCK_MAX); + int blocks_z = divup(nnzPerRow, nnzPerBlock); + + dim3 blocks(blocks_x, blocks_y, blocks_z); + + if (blocks_z > 1) { + THCudaCheck(cudaMemsetAsync(outData, 0, outDim * batchSize * sizeof(real), stream)); + } + + real *normalizedValuesData = NULL; + if (maxNormalize && train) { + THCTensor_(resize1d)(state, normalizedValues, keysSize); + normalizedValuesData = THCTensor_(data)(state, normalizedValues); + updateOutput<<>> + (outData, normalizedValuesData, valuesData, cumSumSizesData, keysData, + batchSize, outDim, weightData, biasData, weightStride, keysOffset, maxNormalize, nnzPerBlock); + } else { + updateOutput<<>> + (outData, normalizedValuesData, valuesData, cumSumSizesData, keysData, + batchSize, outDim, weightData, biasData, weightStride, keysOffset, maxNormalize, nnzPerBlock); + } +} + +void THNN_(IndexLinear_accGradParameters)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCTensor* valuesBuffer, + accreal weightDecay, + accreal scale) +{ + int64_t keysSize = keys->size[0]; + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t wDim = weight->size[1]; + int maxNormalize = wDim - outDim; + + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 6, + "gradOutput vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 7, + "gradWeight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 8, + "gradBias vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 9, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 10, + "bias vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, valuesBuffer), 11, + "valuesBuffer vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + THCTensor_(resize2d)(state, gradWeight, keysSize, outDim * (maxNormalize > 0 ? 
2 : 1)); + + real *valuesData = THCTensor_(data) (state, values); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + real *gradOutputData = THCTensor_(data) (state, gradOutput); + real *gradBiasData = THCTensor_(data) (state, gradBias); + real *gradWeightData = THCTensor_(data) (state, gradWeight); + int64_t gradWeightStride = gradWeight->stride[0]; + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + accGradBias<<>> + (gradBiasData, gradOutputData, outDim, batchSize, scale, weightDecay); + + dim3 blocks(blocks_x, batchSize); + accGradWeight<<>> + (gradWeightData, gradOutputData, valuesData, cumSumSizesData, outDim, + gradWeightStride, scale, weightDecay, maxNormalize); +} + +void THNN_(IndexLinear_accUpdateGradParameters)( + THCState *state, + THCudaLongTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCudaLongTensor *sizes, + THCudaLongTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) +{ + // Make sure these inputs are contiguous to accelerate computations + THArgCheck(THCudaLongTensor_isContiguous(state, keys), 1, + "keys vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, values), 3, + "values vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, sizes), 4, + "sizes vector must be contiguous"); + THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 5, + "cumSumSizes vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 6, + "gradOutput vector must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, weight), 7, + "weight matrix must be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, bias), 8, + "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, + "Keys and values should have the same number of elements"); + + int64_t batchSize = sizes->size[0]; + int64_t outDim = bias->size[0]; + int64_t keysSize = keys->size[0]; + int64_t wDim = weight->size[1]; + int maxNormalize = wDim - outDim; + + real *biasData = THCTensor_(data) (state, bias); + real *weightData = THCTensor_(data) (state, weight); + real *gradOutputData = THCTensor_(data) (state, gradOutput); + real *valuesData = THCTensor_(data) (state, values); + int64_t *keysData = THCudaLongTensor_data (state, keys); + int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); + int64_t weightStride = weight->stride[0]; + + cudaStream_t stream = THCState_getCurrentStream(state); + dim3 threads(THREADS_X, THREADS_Y); + int blocks_x = divup(outDim, threads.x); + + accGradBias<<>> + (biasData, gradOutputData, outDim, batchSize, scale, weightDecay); + + int64_t nnzPerRow = divup(keysSize, batchSize); + int blocks_y = divup(nnzPerRow, REPEAT * threads.y); + dim3 blocks(blocks_x, blocks_y); + + for (int64_t batchId = 0; batchId < batchSize; batchId++) { + accUpdateWeight<<>> + (weightData, weightStride, gradOutputData, outDim, valuesData, + cumSumSizesData, keysData, keysOffset, scale, weightDecay, maxNormalize, + batchId); + } +} + +void THNN_(IndexLinear_updateParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCudaLongTensor *runningKeys, + THCudaLongTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate) +{ + // Make sure these inputs are contiguous to accelerate computations + 
THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 1,
+             "gradWeight matrix must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, gradBias), 2,
+             "gradBias vector must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, weight), 3,
+             "weight matrix must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, bias), 4,
+             "bias vector must be contiguous");
+  THArgCheck(THCudaLongTensor_isContiguous(state, runningKeys), 5,
+             "runningKeys vector must be contiguous");
+  THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 6,
+             "cumSumSizes vector must be contiguous");
+
+  int64_t outDim = bias->size[0];
+  int64_t wDim = weight->size[1];
+  int maxNormalize = wDim - outDim;
+  int64_t keysSize = runningKeys->size[0];
+  int64_t batchSize = cumSumSizes->size[0];
+
+  THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias);
+  int64_t gradWeightStride = gradWeight->stride[0];
+  int64_t weightStride = weight->stride[0];
+
+  int64_t *keysData = THCudaLongTensor_data (state, runningKeys);
+  int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes);
+  real *gradWeightData = THCTensor_(data) (state, gradWeight);
+  real *weightData = THCTensor_(data) (state, weight);
+
+  dim3 threads(THREADS_X, THREADS_Y);
+  int64_t nnzPerRow = divup(keysSize, batchSize);
+  int blocks_x = divup(outDim, threads.x);
+  int blocks_y = divup(nnzPerRow, REPEAT * threads.y);
+  dim3 blocks(blocks_x, blocks_y);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  for (int64_t batchId = 0; batchId < batchSize; batchId++) {
+    updateWeight<real><<<blocks, threads, 0, stream>>>
+      (weightData, gradWeightData, keysData, cumSumSizesData, outDim,
+       gradWeightStride, weightStride, keysOffset, learningRate, weightDecay,
+       maxNormalize, batchId);
+  }
+}
+#endif
diff --git a/aten/src/THCUNN/generic/L1Cost.cu b/aten/src/THCUNN/generic/L1Cost.cu
new file mode 100644
index 0000000..fd85e61
--- /dev/null
+++ b/aten/src/THCUNN/generic/L1Cost.cu
@@ -0,0 +1,44 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/L1Cost.cu"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{
+  THCUNN_check_dim_size(state, output, 1, 0, 1);
+  THCUNN_assertSameGPU(state, 1, input);
+  accreal sum;
+  ptrdiff_t size = THCTensor_(nElement)(state, input);
+  input = THCTensor_(newContiguous)(state, input);
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());
+
+  THCTensor_(free)(state, input);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(L1Cost_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)
+{
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 2, input, gradInput);
+  ptrdiff_t size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());
+
+  THCTensor_(free)(state, input);
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LeakyReLU.cu b/aten/src/THCUNN/generic/LeakyReLU.cu
new file mode 100644
index 0000000..dc92090
--- /dev/null
+++ b/aten/src/THCUNN/generic/LeakyReLU.cu
@@ -0,0 +1,59 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LeakyReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LeakyReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           accreal negval_,
+           bool inplace)
+{
+  real negval = ScalarConvert<accreal, real>::to(negval_);
+
+  THCUNN_assertSameGPU(state, 2, input, output);
+
+  if (inplace)
+  {
+    THC_pointwiseApply1<real>(state, input, LeakyReLUUpdateOutputIP<real>(negval));
+    THCTensor_(set)(state, output, input);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2<real, real>(state, output, input, LeakyReLUUpdateOutput<real>(negval));
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           accreal negval_,
+           bool inplace)
+{
+  real negval = ScalarConvert<accreal, real>::to(negval_);
+
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
+
+  if (inplace)
+  {
+    THC_pointwiseApply2<real, real>(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
+    THCTensor_(set)(state, gradInput, gradOutput);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THC_pointwiseApply3<real, real, real>(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LogSigmoid.cu b/aten/src/THCUNN/generic/LogSigmoid.cu
new file mode 100644
index 0000000..02d55da
--- /dev/null
+++ b/aten/src/THCUNN/generic/LogSigmoid.cu
@@ -0,0 +1,31 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSigmoid_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *buffer)
+{
+  THCUNN_assertSameGPU(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2<real, real>(state, output, input, logSigmoid_updateOutput_functor<real>());
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *buffer)
+{
+  THCUNN_check_nElement(state, input, gradOutput);
+  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3<real, real, real>(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
+}
+
+#endif
diff --git a/aten/src/THCUNN/generic/LookupTable.cu b/aten/src/THCUNN/generic/LookupTable.cu
new file mode 100644
index 0000000..22653dd
--- /dev/null
+++ b/aten/src/THCUNN/generic/LookupTable.cu
@@ -0,0 +1,212 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LookupTable.cu"
+#else
+
+void THNN_(LookupTable_accGradParameters)(
+           THCState *state,
+           THCIndexTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCIndexTensor *count,
+           THCIndexTensor *sortedIndices,
+           THCIndexTensor *origIndices,
+           bool scaleGradByFreq,
+           int paddingValue,
+           accreal scale_)
+{
+  real scale = ScalarConvert<accreal, real>::to(scale_);
+  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sortedIndices, origIndices);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  if (!(THCIndexTensor_(isContiguous)(state, input) &&
+        THCTensor_(isContiguous)(state, gradWeight))) {
+    THError("Tensors must be contiguous");
+  }
+
+  int nDim = THCIndexTensor_(_nDimension)(state, input);
+  if (THCIndexTensor_(_nDimension)(state, input) != 1 && THCIndexTensor_(_nDimension)(state, input) != 2) {
+    THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, input); +
THError("input must be a vector or matrix, but is of shape: %s", s1.str); + } + + ptrdiff_t numel = THCIndexTensor_(nElement)(state, input); + int64_t stride = THCTensor_(stride)(state, gradWeight, 0); + + cudaStream_t stream = THCState_getCurrentStream(state); + + if (numel <= 768 && !scaleGradByFreq) { + const int WARP_SIZE = 32; + const int BLOCKDIMY = 32; + dim3 grid(THCCeilDiv(stride, (int64_t)WARP_SIZE)); + dim3 block(WARP_SIZE, BLOCKDIMY); + + cunn_LookupTable_accGradParametersKernelByFeature + <<>> + (THCIndexTensor_(data)(state, input), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + scale, + numel, + stride, + paddingValue); + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); + return; + } + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THCIndexTensor_(resize)(state, sortedIndices, inputSize, NULL); + THCIndexTensor_(resize)(state, origIndices, inputSize, NULL); + THLongStorage_free(inputSize); + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + THCIndexTensor_(copy)(state, sortedIndices, input); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr + sortedIndicesIter(THCIndexTensor_(data)(state, sortedIndices)); + thrust::device_ptr + origIndicesIter(THCIndexTensor_(data)(state, origIndices)); + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIter(TH_INDEX_BASE); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + numel, origIndicesIter); + + // Sort; a stable sort is not required + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndicesIter, sortedIndicesIter + numel, + origIndicesIter, ThrustLTOp()); + } + + THCIndex_t *sortedIndices_data = THCIndexTensor_(data)(state, sortedIndices); + THCIndex_t *origIndices_data = THCIndexTensor_(data)(state, origIndices); + THCIndex_t *count_data = NULL; + + if (scaleGradByFreq) { + THCIndexTensor_(resizeAs)(state, count, input); + count_data = THCIndexTensor_(data)(state, count); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr sortedIndices_ptr(sortedIndices_data); + thrust::device_ptr count_ptr(count_data); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndices_ptr, + sortedIndices_ptr + numel, + thrust::make_constant_iterator(1), + count_ptr + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + thrust::make_reverse_iterator(sortedIndices_ptr + numel), + thrust::make_reverse_iterator(sortedIndices_ptr), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + cunn_LookupTable_accGradParametersKernel<<>>( + sortedIndices_data, + origIndices_data, + 
THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + count_data, + scale, + numel, + stride, + paddingValue + ); + + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); +} + +#define THREADS 256 +#define RUN(NORM, IDXTYPE) \ + calculate_norms_and_renorm \ + <<>> \ + (weightsRaw, idxRaw, normType, maxNorm, THCTensor_(stride)(state, weight, 0)) + +void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + accreal maxNorm, + accreal normType) +{ + THCUNN_assertSameGPU(state, 2, idx, weight); + if (!(THCIndexTensor_(isContiguous)(state, idx) && + THCTensor_(isContiguous)(state, weight))) { + THError("Tensors must be contiguous"); + } + + if (THCIndexTensor_(_nDimension)(state, idx) != 1) { + THError("idx must be a vector"); + } + + if (normType <= 0) { + THError("non-positive-norm not supported"); + } + + THCIndex_t numel = THCIndexTensor_(nElement)(state, idx); + + real * weightsRaw = THCTensor_(data)(state, weight); + THCIndex_t * idxRaw = THCIndexTensor_(data)(state, idx); + + // get the unique indices + thrust::device_ptr idxThrust(idxRaw); + thrust::device_ptr endIdxThrust(thrust::unique(idxThrust, idxThrust+numel)); + numel = endIdxThrust - idxThrust; + + // At launch time figure out what the index type is and norm type + int Norm = ScalarConvert::to(normType); + if (THCTensor_canUse32BitIndexMath(state, idx)) { + if (Norm == 1) { + RUN(1, unsigned int); + } else if (Norm == 2) { + RUN(2, unsigned int); + } else { + RUN(-1, unsigned int); + } + } else { + if (Norm == 1) { + RUN(1, unsigned long); + } else if (Norm == 2) { + RUN(2, unsigned long); + } else { + RUN(-1, unsigned long); + } + } +} + +#endif diff --git a/aten/src/THCUNN/generic/LookupTableBag.cu b/aten/src/THCUNN/generic/LookupTableBag.cu new file mode 100644 index 0000000..8386f60 --- /dev/null +++ b/aten/src/THCUNN/generic/LookupTableBag.cu @@ -0,0 +1,200 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LookupTableBag.cu" +#else + + +void THNN_(LookupTableBag_updateOutput)( + THCState *state, + THCIndexTensor *input, + THCIndexTensor *offsets, + THCTensor *weight, + THCTensor *output, + THCIndexTensor *offset2bag, + int mode, + THCIndexTensor *bag_size) +{ + THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag); + + if (!(THCIndexTensor_(isContiguous)(state, input) && + THCIndexTensor_(isContiguous)(state, offsets) && + THCTensor_(isContiguous)(state, weight))) { + THError("Tensors must be contiguous"); + } + + ptrdiff_t numIndices = THCIndexTensor_(size)(state, input, 0); + ptrdiff_t numBags = THCIndexTensor_(size)(state, offsets, 0); + ptrdiff_t stride = THCTensor_(size)(state, weight, 1); + int64_t *bag_size_data = NULL; + if (bag_size != NULL) { + bag_size_data = THCIndexTensor_(data)(state, bag_size); + } + + cudaStream_t stream = THCState_getCurrentStream(state); + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THLongStorage *outputSize = THLongStorage_newWithSize(2); + THLongStorage_data(outputSize)[0] = numBags; + THLongStorage_data(outputSize)[1] = stride; + THCTensor_(resize)(state, output, outputSize, NULL); + THCTensor_(zero)(state, output); + THCIndexTensor_(resize)(state, offset2bag, inputSize, NULL); + THLongStorage_free(inputSize); + THLongStorage_free(outputSize); + + dim3 block = dim3(32, 8); + int grid = 1024; + cunn_LookupTableBag_updateOutputKernel<<>>( + THCIndexTensor_(data)(state, input), + THCIndexTensor_(data)(state, offsets), + THCTensor_(data)(state, 
weight), + THCTensor_(data)(state, output), + THCIndexTensor_(data)(state, offset2bag), + numIndices, + numBags, + stride, + mode, + bag_size_data + ); + + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(LookupTableBag_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *offset2bag, + THCIndexTensor *count, + THCIndexTensor *sortedIndices, + THCIndexTensor *origIndices, + bool scaleGradByFreq, + int mode, + THCIndexTensor *bag_size, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, offset2bag, sortedIndices, origIndices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (!(THCIndexTensor_(isContiguous)(state, input) && + THCTensor_(isContiguous)(state, gradWeight) && + THCIndexTensor_(isContiguous)(state, offset2bag))) { + THError("Tensors must be contiguous"); + } + + int64_t *bag_size_data = NULL; + if (bag_size != NULL) { + bag_size_data = THCIndexTensor_(data)(state, bag_size); + } + + int nDim = THCIndexTensor_(_nDimension)(state, input); + if (THCIndexTensor_(_nDimension)(state, input) != 1 && THCIndexTensor_(_nDimension)(state, input) != 2) { + THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, input); + THError("input must be a vector or matrix, but is of shape: %s", s1.str); + } + + ptrdiff_t numel = THCIndexTensor_(nElement)(state, input); + int64_t stride = THCTensor_(stride)(state, gradWeight, 0); + + cudaStream_t stream = THCState_getCurrentStream(state); + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THCIndexTensor_(resize)(state, sortedIndices, inputSize, NULL); + THCIndexTensor_(resize)(state, origIndices, inputSize, NULL); + THLongStorage_free(inputSize); + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + THCIndexTensor_(copy)(state, sortedIndices, input); + + THCThrustAllocator thrustAlloc(state); + + thrust::device_ptr + sortedIndicesIter(THCIndexTensor_(data)(state, sortedIndices)); + thrust::device_ptr + origIndicesIter(THCIndexTensor_(data)(state, origIndices)); + + // Fill sortedOrigIndices with sequential indices + thrust::counting_iterator countIter(TH_INDEX_BASE); + + thrust::copy( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + countIter, countIter + numel, origIndicesIter); + + // Sort; a stable sort is not required + thrust::sort_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndicesIter, sortedIndicesIter + numel, + origIndicesIter, ThrustLTOp()); + } + + THCIndex_t *sortedIndices_data = THCIndexTensor_(data)(state, sortedIndices); + THCIndex_t *origIndices_data = THCIndexTensor_(data)(state, origIndices); + THCIndex_t *offset2bag_data = THCIndexTensor_(data)(state, offset2bag); + THCIndex_t *count_data = NULL; + + if (scaleGradByFreq) { + THCIndexTensor_(resizeAs)(state, count, input); + count_data = THCIndexTensor_(data)(state, count); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr sortedIndices_ptr(sortedIndices_data); + thrust::device_ptr count_ptr(count_data); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + 
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + sortedIndices_ptr, + sortedIndices_ptr + numel, + thrust::make_constant_iterator(1), + count_ptr + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + thrust::make_reverse_iterator(sortedIndices_ptr + numel), + thrust::make_reverse_iterator(sortedIndices_ptr), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::equal_to(), + thrust::maximum() + ); + } + + dim3 grid(THCCeilDiv(numel, (ptrdiff_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + cunn_LookupTableBag_accGradParametersKernel<<>>( + sortedIndices_data, + origIndices_data, + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + offset2bag_data, + count_data, + scale, + numel, + stride, + mode, + bag_size_data + ); + + THCTensor_(free)(state, gradOutput); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/MSECriterion.cu b/aten/src/THCUNN/generic/MSECriterion.cu new file mode 100644 index 0000000..e41e741 --- /dev/null +++ b/aten/src/THCUNN/generic/MSECriterion.cu @@ -0,0 +1,126 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MSECriterion.cu" +#else + +void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction != Reduction::None) { + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, (accreal) 0, + thrust::plus(), mse_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); + return; + } + + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3( + state, + input, + target, + output, + mse_updateOutput_functor()); +} + +void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + if (reduction != Reduction::None) { + ptrdiff_t size = THCTensor_(nElement)(state, input); + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + accreal norm = reduction == Reduction::ElementwiseMean ? 
(accreal)(2)/size : (accreal)(2); + norm *= ScalarConvert::to(THCTensor_(get1d)(state, gradOutput, 0)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + mse_updateGradInput_functor(norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + return; + } + + THCUNN_check_shape(state, input, gradOutput); + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradOutput_data(THCTensor_(data)(state, gradOutput)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + mse_updateGradInput_functor(2)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + gradInput_data, gradInput_data+size, gradOutput_data, gradInput_data, + thrust::multiplies()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/MarginCriterion.cu b/aten/src/THCUNN/generic/MarginCriterion.cu new file mode 100644 index 0000000..221f9d9 --- /dev/null +++ b/aten/src/THCUNN/generic/MarginCriterion.cu @@ -0,0 +1,70 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MarginCriterion.cu" +#else + +void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_check_nElement(state, input, target); + THCUNN_check_dim_size(state, output, 1, 0, 1); + THCUNN_assertSameGPU(state, 2, input, target); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), + margin_functor(ScalarConvert::to(margin))); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + + +void THNN_(MarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); 
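For context, MarginCriterion is the elementwise hinge loss max(0, margin - x*y); the margin_updateGradInput_functor applied by the thrust::transform below is assumed to compute its subgradient scaled by norm (1/size when sizeAverage is set). A minimal standalone sketch of that rule, with the struct name chosen purely for illustration:

struct margin_grad_sketch {
  float margin, norm;
  margin_grad_sketch(float m, float n) : margin(m), norm(n) {}
  float operator()(float x, float y) const {
    // subgradient of norm * max(0, margin - x*y) with respect to x
    return (x * y < margin) ? -norm * y : 0.f;
  }
};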
+ + THCUNN_check_nElement(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, gradInput); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + accreal norm = sizeAverage ? 1.f/size : 1; + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + margin_updateGradInput_functor(ScalarConvert::to(margin), norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu new file mode 100644 index 0000000..2b02bf2 --- /dev/null +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -0,0 +1,162 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu" +#else + +// TODO: improve error messages +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + int64_t reduction) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + THCTensor_(resizeAs)(state, istarget, input); + + if(input->dim() == 1) + { + int dim = input->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + THCTensor_(resize1d)(state, output, 1); + + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + 1, dim, + reduction == Reduction::ElementwiseMean + ); + THCudaCheck(cudaGetLastError()); + } + else if(input->dim() == 2) + { + int nframe = input->size[0]; + int dim = input->size[1]; + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + + dim3 blocks(input->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + if (reduction != Reduction::None) + { + THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]); + THCTensor_(resize1d)(state, output, 1); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output_tmp), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + nframe, dim, + reduction == Reduction::ElementwiseMean + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(THCTensor_(sumall)(state, output_tmp))); + THCTensor_(free)(state, output_tmp); + } + else + { + THCTensor_(resize1d)(state, output, input->size[0]); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel + <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + nframe, dim, + false + ); + THCudaCheck(cudaGetLastError()); + } + } + else + AT_ERROR("non-empty vector or matrix expected, got size: ", input->sizes()); + + 
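As a reference for what the kernels above are assumed to reduce: the standard multilabel margin loss for one row x of length dim with target indices y sums max(0, 1 - (x[y[j]] - x[i])) over valid targets j and non-target classes i, normalized by dim; an ElementwiseMean reduction then averages the per-row values. A host-side sketch under that assumption (padding convention assumed, helper name illustrative):

#include <algorithm>
#include <vector>

static float multilabel_margin_row_sketch(const float* x, const long* y, int dim) {
  // y is assumed to list valid class indices first, padded with negative values
  std::vector<char> is_target(dim, 0);
  int n_targets = 0;
  for (; n_targets < dim && y[n_targets] >= 0; ++n_targets)
    is_target[y[n_targets]] = 1;
  float loss = 0.f;
  for (int j = 0; j < n_targets; ++j)
    for (int i = 0; i < dim; ++i)
      if (!is_target[i])
        loss += std::max(0.f, 1.f - (x[y[j]] - x[i]));
  return loss / dim;  // per-row value; a mean reduction would average these over rows
}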
THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *istarget, + int64_t reduction) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + if(gradInput->dim() == 1) + { + int dim = gradInput->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size[0] == dim), 3, + "inconsistent isTarget size"); + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel + <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + reduction != Reduction::None); + + } + else if(gradInput->dim() == 2) + { + int nframe = gradInput->size[0]; + int dim = gradInput->size[1]; + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size[0] == nframe) + && (istarget->size[1] == dim), 3, "inconsistent isTarget size"); + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel + <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + gradInput->size[0], gradInput->size[1], + reduction == Reduction::ElementwiseMean, + reduction != Reduction::None); + } + else + AT_ERROR("non-empty vector or matrix expected, got size: ", gradInput->sizes()); + + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu new file mode 100644 index 0000000..a620c0f --- /dev/null +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -0,0 +1,236 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiMarginCriterion.cu" +#else + +// TODO: improve error messages +void THNN_(MultiMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + int p, + THCTensor *weights, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_assertSameGPU(state, 2, input, target); + input = THCTensor_(newContiguous)(state, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + if (input->dim() == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + THCTensor_(resize1d)(state, output, 1); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output), + 
THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + reduction == Reduction::ElementwiseMean, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + reduction == Reduction::ElementwiseMean, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->dim() == 2) + { + int nframe = input->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + dim3 blocks(input->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (reduction == Reduction::None) + { + THCTensor_(resize1d)(state, output, input->size[0]); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + false, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + false, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else + { + THCTensor_(resize1d)(state, output, 1); + THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, input->size[1], + reduction == Reduction::ElementwiseMean, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL, + input->size[0], input->size[1], + reduction == Reduction::ElementwiseMean, + margin + ); + } + THCudaCheck(cudaGetLastError()); + float sum = THCTensor_(sumall)(state, output_); + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); + THCTensor_(free)(state, output_); + } + } + else + { + AT_ERROR("non-empty vector or matrix expected, got sizes: ", input->sizes()); + } + + THCTensor_(free)(state, input); + if(weights) + THCTensor_(free)(state, weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + int p, + THCTensor *weights, + accreal margin_) +{ + real margin = ScalarConvert::to(margin_); + THCUNN_assertSameGPU(state, 3, input, gradInput, target); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + + if (input->dim() == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->dim() == 2) + { + int nframe = gradInput->size[0]; + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + nframe, gradInput->size[1], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL, + nframe, gradInput->size[1], + reduction == Reduction::ElementwiseMean, + margin, + reduction != Reduction::None + ); + } + THCudaCheck(cudaGetLastError()); + } + else + { + AT_ERROR("non-empty vector or matrix expected, got ", input->sizes()); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + if(weights) + THCTensor_(free)(state, weights); +} + +#endif diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu new file mode 100644 index 0000000..e03d573 --- /dev/null +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -0,0 +1,164 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/PReLU.cu" +#else + +void THNN_(PReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight) +{ + THCTensor_(resizeAs)(state, output, input); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + + weight = THCTensor_(newContiguous)(state, weight); + real *w = THCTensor_(data)(state, weight); + + if (nOutputPlane == 1) + { + THC_pointwiseApply2(state, output, input, PReLUUpdateOutput(w)); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + input = THCTensor_(newContiguous)(state, input); + + int n = THCTensor_(nElement)(state, input); + if (input->size[ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + + int mapSize = 1; + for (int d = 2; d < ndim; d++) { + mapSize *= input->size[d]; + } + int nElemsPerSample = nOutputPlane * mapSize; + preluForward<<>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + w, + n, nElemsPerSample, mapSize + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + } + + THCTensor_(free)(state, weight); +} + +void THNN_(PReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + + weight = THCTensor_(newContiguous)(state, weight); + real *w = THCTensor_(data)(state, weight); + if (nOutputPlane == 1) + { + THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput(w)); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int n = THCTensor_(nElement)(state, input); + if (input->size[ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + + int mapSize = 1; + for (int d = 2; d < ndim; d++) { + mapSize *= input->size[d]; + } + int nElemsPerSample = nOutputPlane * mapSize; + preluBackward<<>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + w, + THCTensor_(data)(state, gradOutput), + n, nElemsPerSample, mapSize + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + } + THCTensor_(free)(state, weight); +} + +void THNN_(PReLU_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradWeight, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_check_nElement(state, input, gradOutput); + int64_t nOutputPlane = THCTensor_(numel)(state, weight); + // use grad input for temporary storage, then call updateGradInput again + + if (nOutputPlane == 1) + { + THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared()); + + // introduces a sync point + real sum = ScalarConvert::to(THCTensor_(sumall)(state, gradInput)); + real w = THCTensor_(get1d)(state, gradWeight, 0); + THCTensor_(set1d)(state, gradWeight, 0, w + sum * scale); + + // restore gradInput + THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight); + } + else + { + int ndim = THCTensor_(_nDimension)(state, input); + + if (ndim == 1) + { + THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1(scale)); + } + else + { + THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters(scale)); + THCTensor *gradWeightBuf = THCTensor_(new)(state); + THCTensor_(resizeAs)(state, gradWeightBuf, gradWeight); + + if (ndim == 2) + { + THCTensor_(sum)(state, gradWeightBuf, gradInput, 0, 1); + THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); + } + else + { + THCTensor *sumbuf = THCTensor_(new)(state); + THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput); + int64_t size3 = 1; + for (int d = 2; d < ndim; d++) { + size3 *= input->size[d]; + } + THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, size3); + THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane); + THCTensor_(sum)(state, sumbuf, buffer, 2, 1); + THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0, 1); + THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); + THCTensor_(free)(state, buffer); + THCTensor_(free)(state, sumbuf); + } + + THCTensor_(free)(state, gradWeightBuf); + + // restore gradInput + THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight); + } + } +} + +#endif diff --git a/aten/src/THCUNN/generic/RReLU.cu b/aten/src/THCUNN/generic/RReLU.cu new file mode 100644 index 0000000..bea7f10 --- /dev/null +++ b/aten/src/THCUNN/generic/RReLU.cu @@ -0,0 +1,109 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/RReLU.cu" +#else + +#include "../common.h" + +void THNN_(RReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace, + void *generator) +{ + THCUNN_assertSameGPU(state, 3, input, output, noise); + struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state); + + if (train) + { + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, noise, input); + real *input_data = THCTensor_(data)(state, input); + real *noise_data = 
THCTensor_(data)(state, noise); + ptrdiff_t n = THCTensor_(nElement)(state, input); + if (inplace) + { + rreluUpdateOutputTrain<<>>( + n, gen_states, input_data, noise_data, input_data, lower, upper); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + real *output_data = THCTensor_(data)(state, output); + rreluUpdateOutputTrain<<>>( + n, gen_states, input_data, noise_data, output_data, lower, upper); + } + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + } + else + { + const real negSlope = ScalarConvert::to((lower + upper) / 2); + if (inplace) + { + THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor(negSlope)); + } + } +} + +void THNN_(RReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace) +{ + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THCTensor_(cmul)(state, gradOutput, gradOutput, noise); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(cmul)(state, gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = ScalarConvert::to((lower + upper) / 2); + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope)); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope)); + } + } + + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Sigmoid.cu b/aten/src/THCUNN/generic/Sigmoid.cu new file mode 100644 index 0000000..a91a5dd --- /dev/null +++ b/aten/src/THCUNN/generic/Sigmoid.cu @@ -0,0 +1,28 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Sigmoid.cu" +#else + +#include "../common.h" + +void THNN_(Sigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(sigmoid)(state, output, input); +} + +void THNN_(Sigmoid_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_nElement(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoid_updateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SmoothL1Criterion.cu b/aten/src/THCUNN/generic/SmoothL1Criterion.cu new file mode 100644 index 0000000..1760b08 --- /dev/null +++ b/aten/src/THCUNN/generic/SmoothL1Criterion.cu @@ -0,0 +1,103 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu" +#else + +void THNN_(SmoothL1Criterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor 
*target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + THArgCheck( + THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + smoothl1_updateOutput_no_reduce_functor()); + return; + } + + THCTensor_(resize1d)(state, output, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, (accreal) 0, + thrust::plus(), smoothl1_functor() + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(SmoothL1Criterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + THArgCheck( + THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + smoothl1_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + THCUNN_check_dim_size(state, gradOutput, 1, 0, 1); + + ptrdiff_t size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert::to(reduction == Reduction::ElementwiseMean ? 
accreal(1)/size : accreal(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCThrustAllocator thrustAlloc(state); + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + smoothl1_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0)) + ); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftMarginCriterion.cu b/aten/src/THCUNN/generic/SoftMarginCriterion.cu new file mode 100644 index 0000000..47a4368 --- /dev/null +++ b/aten/src/THCUNN/generic/SoftMarginCriterion.cu @@ -0,0 +1,81 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 3, input, target, output); + + if (reduction == Reduction::None) { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply3(state, input, target, output, + softmargin_no_reduce_functor()); + return; + } + + accreal sum; + ptrdiff_t size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + THCTensor_(resize1d)(state, output, 1); + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), softmargin_functor()); + + if (reduction == Reduction::ElementwiseMean) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction) +{ + THCUNN_check_shape(state, input, target); + THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + + if (reduction == Reduction::None) { + THCUNN_check_shape(state, gradOutput, input); + THC_pointwiseApply3(state, input, target, gradInput, + softmargin_updateGradInput_no_reduce_functor()); + THCTensor_(cmul)(state, gradInput, gradInput, gradOutput); + return; + } + + ptrdiff_t size = THCTensor_(nElement)(state, input); + accreal norm = (reduction == Reduction::ElementwiseMean ? 
1./size : 1.); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + + thrust::device_ptr input_data(THCTensor_(data)(state, input)); + thrust::device_ptr target_data(THCTensor_(data)(state, target)); + thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + softmargin_updateGradInput_functor(norm, THCTensor_(get1d)(state, gradOutput, 0))); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftPlus.cu b/aten/src/THCUNN/generic/SoftPlus.cu new file mode 100644 index 0000000..5154d8d --- /dev/null +++ b/aten/src/THCUNN/generic/SoftPlus.cu @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftPlus.cu" +#else + +#include "../common.h" + +void THNN_(SoftPlus_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = ScalarConvert::to(beta_); + real threshold = ScalarConvert::to(threshold_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor(threshold, beta)); +} + +void THNN_(SoftPlus_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = ScalarConvert::to(beta_); + real threshold = ScalarConvert::to(threshold_); + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor(threshold, beta)); +} + +#endif diff --git a/aten/src/THCUNN/generic/SoftShrink.cu b/aten/src/THCUNN/generic/SoftShrink.cu new file mode 100644 index 0000000..0743f70 --- /dev/null +++ b/aten/src/THCUNN/generic/SoftShrink.cu @@ -0,0 +1,35 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SoftShrink.cu" +#else + +#include "../common.h" + +void THNN_(SoftShrink_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal lambda_) +{ + real lambda = ScalarConvert::to(lambda_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput(lambda)); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SoftShrink_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal lambda_) +{ + real lambda = ScalarConvert::to(lambda_); + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput(lambda)); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu new file mode 100644 index 0000000..d5270d6 --- /dev/null +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -0,0 +1,274 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SparseLinear.cu" +#else + +static bool THNN_(checkInput)(THCTensor* t) +{ + return !t->is_empty() && t->_dim() == 2 && t->size[1] == 3; +} + +static bool THNN_(checkSize2D)(THCTensor* t, 
int64_t size0, int64_t size1) +{ + return !t->is_empty() && t->_dim() == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) +{ + return !t->is_empty() && t->_dim() == 1 && t->size[0] == size0; +} + +static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { + #ifdef THC_REAL_IS_FLOAT + THCudaIntTensor_copyCudaFloat(state, buf, t); + #elif defined(THC_REAL_IS_DOUBLE) + THCudaIntTensor_copyCudaDouble(state, buf, t); + #elif defined(THC_REAL_IS_HALF) + THCudaIntTensor_copyCudaHalf(state, buf, t); + #endif +} + +void THNN_(SparseLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias) +{ + THAssert(THCTensor_(checkGPU)(state, 4, input, output, weight, bias)); + + int64_t h; + int64_t outDim = THCTensor_(size)(state, weight, 0); + int64_t inDim = THCTensor_(size)(state, weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 3"); + AT_CHECK(!output->is_empty() && THCTensor_(nDimension)(state, output) == 2, + "output must be batchsize x outputsize, got size: ", output->sizes()); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + weight = THCTensor_(newContiguous)(state, weight); + + int64_t batchnum = THCTensor_(size)(state, output, 0); + int64_t nnz = THCTensor_(size)(state, input, 0); + + THCTensor *buffer = THCTensor_(new)(state); + THCTensor *sel = THCTensor_(new)(state); + THCTensor *values = THCTensor_(new)(state); + THCudaIntTensor *rowbuf = THCudaIntTensor_new(state); + THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state); + THCudaIntTensor *colInds = THCudaIntTensor_new(state); + + THCTensor_(resize1d)(state, values, nnz); + THCudaIntTensor_resize1d(state, rowbuf, nnz); + THCudaIntTensor_resize1d(state, colInds, nnz); + THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1); + + // Get data ready for cusparse, need CudaInt buffers + // We do not need to sort, since rows are already in order + // If rows might get out of order in future implementations, or if cusparse + // complains with an illegal memory access, sort like we do in AccGradParameters + THCTensor_(select)(state, sel, input, 1, 0); + THNN_(copyCudaFloatingType)(state, rowbuf, sel); + THCTensor_(select)(state, sel, input, 1, 1); + THNN_(copyCudaFloatingType)(state, colInds, sel); + THCTensor_(select)(state, sel, input, 1, 2); + THCTensor_(copyCuda)(state, values, sel); + + init_cusparse(); + cusparseXcoo2csr(cusparse_handle, + THCudaIntTensor_data(state, rowbuf), nnz, batchnum, + THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE); + + // output = bias + THCTensor_(resize2d)(state, buffer, outDim, batchnum); + THCTensor_(zero)(state, buffer); + for (h=0; h::to(1); + cusparseMatDescr_t descr = 0; + cusparseCreateMatDescr(&descr); + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); + #ifdef THC_REAL_IS_FLOAT + cusparseScsrmm(cusparse_handle, + #elif defined(THC_REAL_IS_DOUBLE) + cusparseDcsrmm(cusparse_handle, + #endif + CUSPARSE_OPERATION_NON_TRANSPOSE, + batchnum, outDim, inDim, nnz, + &one, + descr, + THCTensor_(data)(state, values), + THCudaIntTensor_data(state, csrPtrs), + THCudaIntTensor_data(state, colInds), + THCTensor_(data)(state, weight), inDim, + &one, THCTensor_(data)(state, buffer), batchnum + ); + THCTensor_(transpose)(state, buffer, NULL, 0, 1); + + // We do work in the buffer to keep the output contiguous + 
THCTensor_(copy)(state, output, buffer); + + cusparseDestroyMatDescr(descr); + descr = 0; + THCTensor_(free)(state, buffer); + THCTensor_(free)(state, sel); + THCTensor_(free)(state, values); + THCTensor_(free)(state, weight); + THCudaIntTensor_free(state, rowbuf); + THCudaIntTensor_free(state, colInds); + THCudaIntTensor_free(state, csrPtrs); +} + +void THNN_(SparseLinear_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) +{ + int64_t outDim = THCTensor_(size)(state, weight, 0); + int64_t inDim = THCTensor_(size)(state, weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + + weight = THCTensor_(newContiguous)(state, weight); + int64_t nnz = THCTensor_(size)(state, input, 0); + int64_t batchnum = THCTensor_(size)(state, gradOutput, 0); + + THCTensor *buf = THCTensor_(new)(state); + THCTensor *cols = THCTensor_(new)(state); + THCTensor *sel = THCTensor_(new)(state); + THCudaLongTensor *inds = THCudaLongTensor_new(state); + THCTensor *values = THCTensor_(new)(state); + THCudaIntTensor *colbuf = THCudaIntTensor_new(state); + THCudaIntTensor *colPtrs = THCudaIntTensor_new(state); + THCudaIntTensor *rowInds = THCudaIntTensor_new(state); + + THCTensor_(select)(state, sel, input, 1, 0); // rowInds + THCTensor_(select)(state, cols, input, 1, 1); // colInds + THCTensor_(cadd)(state, buf, sel, batchnum, cols); // colInds * buatchdim + rowInds + THCTensor_(sort)(state, buf, inds, buf, 0, 0); // Indices are now in ind + THCTensor_(indexSelect)(state, buf, input, 0, inds); + + THCTensor_(resize1d)(state, values, nnz); + THCudaIntTensor_resize1d(state, colbuf, nnz); + THCudaIntTensor_resize1d(state, rowInds, nnz); + THCudaIntTensor_resize1d(state, colPtrs, inDim+1); + + // Get data ready for cusparse, need CudaInt buffers + THCTensor_(select)(state, sel, buf, 1, 0); + THNN_(copyCudaFloatingType)(state, rowInds, sel); + THCTensor_(select)(state, sel, buf, 1, 1); + THNN_(copyCudaFloatingType)(state, colbuf, sel); + THCTensor_(select)(state, sel, buf, 1, 2); + THCTensor_(copyCuda)(state, values, sel); + + init_cusparse(); + // Secretly coo2csc + cusparseXcoo2csr(cusparse_handle, + THCudaIntTensor_data(state, colbuf), nnz, inDim, + THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE); + + // FORTRAN expects contiguous col-major matricies + THCTensor *tgradOutput = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutput, gradOutput, 0, 1); + THCTensor_(resize2d)(state, buf, batchnum, outDim); + THCTensor_(copy)(state, buf, tgradOutput); + THCTensor_(free)(state, tgradOutput); + + real one = ScalarConvert::to(1); + cusparseMatDescr_t descr = 0; + cusparseCreateMatDescr(&descr); + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); + #ifdef THC_REAL_IS_FLOAT + cusparseScsrmm(cusparse_handle, + #elif defined(THC_REAL_IS_DOUBLE) + cusparseDcsrmm(cusparse_handle, + #endif + CUSPARSE_OPERATION_NON_TRANSPOSE, + inDim, outDim, batchnum, nnz, + &one, + descr, + THCTensor_(data)(state, values), + THCudaIntTensor_data(state, colPtrs), + THCudaIntTensor_data(state, rowInds), + THCTensor_(data)(state, buf), batchnum, + &one, THCTensor_(data)(state, gradWeight), inDim + ); 
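To clarify the compressed-pointer layout both SparseLinear passes rely on: cusparseXcoo2csr turns a sorted list of one-based row indices into m+1 row pointers, and because the code above sorts by column and feeds column indices instead, the same call "secretly" yields CSC column pointers. A host-side reference of that conversion, written only to illustrate the layout:

// Build CSR-style pointers (one-based, matching CUSPARSE_INDEX_BASE_ONE) from
// nnz sorted COO row indices; ptr gets m+1 entries with ptr[m] == nnz + 1.
static void coo2csr_host_sketch(const int* rowInd, int nnz, int m, int* ptr) {
  for (int i = 0; i <= m; ++i) ptr[i] = 0;
  for (int k = 0; k < nnz; ++k) ptr[rowInd[k]] += 1;  // histogram of rows 1..m
  ptr[0] = 1;                                         // one-based start
  for (int i = 1; i <= m; ++i) ptr[i] += ptr[i - 1];  // prefix sum -> row pointers
}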
+ + THCTensor_(sum)(state, buf, gradOutput, 0, 1); + THCTensor_(resize1d)(state, buf, outDim); + THCTensor_(cadd)(state, gradBias, gradBias, scale, buf); + + if (weightDecay != 0) + { + THCTensor_(cadd)(state, gradWeight, gradWeight, weightDecay, weight); + THCTensor_(cadd)(state, gradBias, gradBias, weightDecay, bias); + } + + THCTensor_(free)(state, weight); + THCTensor_(free)(state, buf); + THCTensor_(free)(state, sel); + THCTensor_(free)(state, cols); + THCudaLongTensor_free(state, inds); + THCTensor_(free)(state, values); + THCudaIntTensor_free(state, colbuf); + THCudaIntTensor_free(state, rowInds); + THCudaIntTensor_free(state, colPtrs); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias) { + THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); +} +void THNN_(SparseLinear_legacyAccGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale) { + THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); +} + +// Dense updates are pretty fast on the GPU +void THNN_(SparseLinear_zeroGradParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput) { + THCTensor_(zero)(state, gradWeight); + THCTensor_(zero)(state, gradBias); +} + +void THNN_(SparseLinear_updateParameters)( + THCState *state, + THCTensor *weight, + THCTensor *bias, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput, + accreal learningRate) { + THCTensor_(cadd)(state, weight, weight, -learningRate, gradWeight); + THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu new file mode 100644 index 0000000..05a7b04 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu @@ -0,0 +1,173 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.cu" +#else + +#include "../common.h" + +// 4d tensor B x D x H x W + +void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + real *output_data; + real *input_data; + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t istrideD = input->stride[0]; + int64_t istrideH = input->stride[1]; + int64_t istrideW = input->stride[2]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, sizeD, osizeH, osizeW); + + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + // run averagepool kernel + adaptiveaveragepool <<>> (input_data, output_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + + } else { + input = THCTensor_(newContiguous)(state, input); + int64_t sizeB = input->size[0]; + int64_t sizeD = 
input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t istrideD = input->stride[1]; + int64_t istrideH = input->stride[2]; + int64_t istrideW = input->stride[3]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, sizeB, sizeD, osizeH, osizeW); + + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeB * sizeD, blocksH); + dim3 threads(32, 8); + + // run averagepool kernel + adaptiveaveragepool <<>> (input_data, output_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests + + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + real *gradInput_data; + real *gradOutput_data; + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t osizeH = gradOutput->size[1]; + int64_t osizeW = gradOutput->size[2]; + + //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel + adaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t osizeH = gradOutput->size[2]; + int64_t osizeW = gradOutput->size[3]; + + //bool atomic = //(isizeW%osizeW != 0) || (isizeH%osizeH != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = max((int)(16L / sizeD), 1); + dim3 blocks(sizeB * sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel, accumulate gradients atomically + adaptiveaveragegradinput <<>> (gradInput_data, gradOutput_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state,gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu new file mode 100644 index 0000000..3e5fab6 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu @@ -0,0 +1,193 @@ +#ifndef THC_GENERIC_FILE 
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu" +#else + +#include "../common.h" + +// 4d tensor B x D x H x W + +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + + THCIndex_t *indices_data; + real *output_data; + real *input_data; + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t istrideD = input->stride[0]; + int64_t istrideH = input->stride[1]; + int64_t istrideW = input->stride[2]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, sizeD, osizeH, osizeW); + THCIndexTensor_(resize3d)(state, indices, sizeD, osizeH, osizeW); + + indices_data = THCIndexTensor_(data)(state, indices); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 1 : blocksH; + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + // run maxpool kernel + adaptivemaxpool <<>> (input_data, output_data, + indices_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + + } else { + input = THCTensor_(newContiguous)(state, input); + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t istrideD = input->stride[1]; + int64_t istrideH = input->stride[2]; + int64_t istrideW = input->stride[3]; + + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, sizeB, sizeD, osizeH, osizeW); + THCIndexTensor_(resize4d)(state, indices, sizeB, sizeD, osizeH, osizeW); + + indices_data = THCIndexTensor_(data)(state, indices); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 
1 : blocksH; + dim3 blocks(sizeB*sizeD, blocksH); + dim3 threads(32, 8); + + // run maxpool kernel + adaptivemaxpool <<>> (input_data, output_data, + indices_data, + isizeH, isizeW, osizeH, osizeW, + istrideD, istrideH, istrideW); + THCudaCheck(cudaGetLastError()); + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices) +{ + bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests + + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + THCIndex_t *indices_data; + real *gradInput_data; + real *gradOutput_data; + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + if (input->dim() == 3) { + int64_t sizeD = input->size[0]; + int64_t isizeH = input->size[1]; + int64_t isizeW = input->size[2]; + + int64_t osizeH = gradOutput->size[1]; + int64_t osizeW = gradOutput->size[2]; + + //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + indices_data = THCIndexTensor_(data)(state, indices); + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 1 : blocksH; + dim3 blocks(sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t sizeB = input->size[0]; + int64_t sizeD = input->size[1]; + int64_t isizeH = input->size[2]; + int64_t isizeW = input->size[3]; + + int64_t osizeH = gradOutput->size[2]; + int64_t osizeW = gradOutput->size[3]; + + //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + indices_data = THCIndexTensor_(data)(state, indices); + gradOutput_data = THCTensor_(data)(state, gradOutput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int blocksH = (int)(16L / sizeD); + blocksH = blocksH < 1 ? 
1 : blocksH; + dim3 blocks(sizeB*sizeD, blocksH); + dim3 threads(32, 8); + + if(atomic) + { + // run updateGradInput kernel, accumulate gradients atomically + atomicadaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + else + { + // run updateGradInput kernel, accumulate gradients atomically + adaptivemaxgradinput <<>> (gradInput_data, gradOutput_data, + indices_data, + isizeH, isizeW, osizeH, osizeW); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state,gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAveragePooling.cu new file mode 100644 index 0000000..7b3d2d4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialAveragePooling.cu @@ -0,0 +1,237 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialAveragePooling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialAveragePooling_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + int kH, int kW, int dH, int dW, int padH, int padW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t nInputRows = input->size[dimh]; + int64_t nInputCols = input->size[dimw]; + int64_t nOutputRows, nOutputCols; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); + } +} + +void THNN_(SpatialAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THNN_(SpatialAveragePooling_shapeCheck) + (state, input, NULL, kH, kW, dH, dW, + padH, padW, ceil_mode); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + if(count_include_pad) + AvePoolForward + <<>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + else + AvePoolForward + <<>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THNN_(SpatialAveragePooling_shapeCheck) + (state, input, gradOutput, kH, kW, dH, dW, + padH, padW, ceil_mode); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + int dimCol = 2; + int dimRow = 1; + + if (input->dim() == 3) { + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + dimCol = 3; + dimRow = 2; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + nInputCols = input->size[dimCol]; + nInputRows = input->size[dimRow]; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + 
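The output-size rule used by the shape check and by both the forward and backward paths in this file can be summarized in one small helper; this is a sketch that mirrors the surrounding arithmetic, with the function name chosen for illustration:

#include <math.h>

// floor or ceil of (inSize - k + 2*pad)/stride, plus one; when padding is used,
// drop the last window if it would start entirely inside the padding.
static long pooled_size_sketch(long inSize, int k, int stride, int pad, int ceil_mode) {
  float span = (float)(inSize - k + 2 * pad);
  long out = (ceil_mode ? (long)ceilf(span / stride) : (long)floorf(span / stride)) + 1;
  if (pad && (out - 1) * stride >= inSize + pad)
    --out;  // ensure the last pooling window starts inside the image
  return out;
}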
nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimRow, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimCol, nOutputCols); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + if(count_include_pad) + AvePoolBackward + <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, + THCTensor_(data)(state, gradInput)); + else + AvePoolBackward + <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu new file mode 100644 index 0000000..693a26d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -0,0 +1,233 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialClassNLLCriterion.cu" +#else + +void THNN_(SpatialClassNLLCriterion_shapeCheck)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *weights) +{ + AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimension)(state, target) == 3, 1, + "only batches of spatial targets supported (non-empty 3D tensors)" \ + " but got targets of size: : ", target->sizes()); + AT_CHECK(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4, 2, + "only batches of spatial inputs supported (non-empty 4D tensors), " \ + "but got input of size: ", input->sizes()); + if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || + THCTensor_(size)(state, input, 2) != THCIndexTensor_(size)(state, target, 1) || + THCTensor_(size)(state, input, 3) != THCIndexTensor_(size)(state, target, 2)) { + THCDescBuff input_size = THCTensor_(sizeDesc)(state, input); + THCDescBuff target_size = THCIndexTensor_(sizeDesc)(state, target); + THError("input and target batch or spatial sizes don't match: target %s, input %s", + target_size.str, input_size.str); + } + + if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) { + THError("weight tensor should be defined either for all or no classes"); + } +} + +static void THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( + THCState *state, + THCTensor *gradOutput, + THCIndexTensor *target) +{ + AT_CHECK(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 3, 2, + "Expected non-empty dimension 3 but got gradOutput of size: ", gradOutput->sizes()); + if (THCTensor_(size)(state, gradOutput, 0) != THCIndexTensor_(size)(state, target, 0) || + THCTensor_(size)(state, gradOutput, 1) 
!= THCIndexTensor_(size)(state, target, 1) || + THCTensor_(size)(state, gradOutput, 2) != THCIndexTensor_(size)(state, target, 2)) { + THCDescBuff gradOutput_size = THCTensor_(sizeDesc)(state, gradOutput); + THCDescBuff target_size = THCIndexTensor_(sizeDesc)(state, target); + THError("gradOutput sizes don't match target sizes: target %s, gradOutput %s", + target_size.str, gradOutput_size.str); + } +} + +void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) +{ + THNN_(SpatialClassNLLCriterion_shapeCheck)(state, input, target, weights); + THCTensor_(resize1d)(state, output, 1); + THCTensor_(resize1d)(state, total_weight, 1); + ignore_index -= TH_INDEX_BASE; + + if (weights) + THCUNN_assertSameGPU(state, 5, input, target, weights, output, total_weight); + else + THCUNN_assertSameGPU(state, 4, input, target, output, total_weight); + + if (reduction == Reduction::None) { + int64_t batch_size = THCTensor_(size)(state, input, 0); + int64_t H = THCTensor_(size)(state, input, 2); + int64_t W = THCTensor_(size)(state, input, 3); + + THCTensor_(resize3d)(state, output, batch_size, H, W); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + int64_t count = batch_size * H * W; + SpatialClassNLLCriterion_updateOutput_no_reduce_kernel + <<>>( + count, + toDeviceTensor(state, input), + toDeviceTensor(state, target), + toDeviceTensor(state, output), + weights ? THCTensor_(data)(state, weights) : NULL, + ignore_index); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0); + THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; + int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 
1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + + THCTensor_(fill)(state, output, ScalarConvert::to(0)); + THCTensor_(fill)(state, total_weight, ScalarConvert::to(0)); + + cunn_SpatialClassNLLCriterion_updateOutput_kernel + <<>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3), + blocks_per_sample, + ignore_index + ); + THCudaCheck(cudaGetLastError()); + if (reduction == Reduction::ElementwiseMean) { + cunn_SpatialClassNLLCriterion_sizeAverage_kernel<<<1, 1, 0, THCState_getCurrentStream(state)>>>( + output_data, total_weight_data + ); + THCudaCheck(cudaGetLastError()); + } + + if (weights) + THCTensor_(free)(state, weights); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, + THCTensor *total_weight, + int64_t ignore_index) +{ + THNN_(SpatialClassNLLCriterion_shapeCheck)(state, input, target, weights); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, + "gradInput must be contiguous"); + ignore_index -= TH_INDEX_BASE; + + if (weights) + THCUNN_assertSameGPU(state, 5, weights, input, target, gradInput, total_weight); + else + THCUNN_assertSameGPU(state, 4, input, target, gradInput, total_weight); + + if (reduction == Reduction::None) { + THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( + state, + gradOutput, + target); + + int64_t batch_size = THCTensor_(size)(state, input, 0); + int64_t H = THCTensor_(size)(state, input, 2); + int64_t W = THCTensor_(size)(state, input, 3); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + } + + int64_t count = batch_size * H * W; + SpatialClassNLLCriterion_updateGradInput_no_reduce_kernel + <<>>( + count, + toDeviceTensor(state, target), + toDeviceTensor(state, gradOutput), + toDeviceTensor(state, gradInput), + weights ? THCTensor_(data)(state, weights) : NULL, + ignore_index); + + if (weights) { + THCTensor_(free)(state, weights); + } + return; + } + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + real *gradInput_data = THCTensor_(data)(state, gradInput); + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0); + THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; + int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 
1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + + cunn_SpatialClassNLLCriterion_updateGradInput_kernel + <<>>( + gradInput_data, + gradOutput_data, + target_data, + weights_data, + total_weight_data, + reduction == Reduction::ElementwiseMean, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2) *THCTensor_(size)(state, input, 3), + blocks_per_sample, + ignore_index + ); + THCudaCheck(cudaGetLastError()); + + if (weights) + THCTensor_(free)(state, weights); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu new file mode 100644 index 0000000..6446394 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -0,0 +1,411 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionLocal.cu" +#else + +static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, + int dW, int padH, int padW, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t nInputPlane = weight->size[2] / (kH * kW); + int64_t nOutputPlane = weight->size[1]; + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 3, 0, nOutputPlane); + THCUNN_check_dim_size(state, bias, 3, 1, outputHeight); + THCUNN_check_dim_size(state, bias, 3, 2, outputWidth); + } + + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +static THCTensor* THNN_(view_weight_local)( + THCState *state, + THCTensor *_weight) +{ + THCTensor *weight = THCTensor_(newContiguous)(state, _weight); + AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, + "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); + if (weight->dim() == 6) { + int64_t s1 = weight->size[0] * weight->size[1]; + int64_t s2 = weight->size[2]; + int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + THCTensor *old_weight = weight; + weight = THCTensor_(newWithStorage3d)(state, + weight->storage, + weight->storageOffset, + s1, -1, s2, -1, s3, -1); + THCTensor_(free)(state, old_weight); + } + return weight; +} + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + THCUNN_assertSameGPU(state, 5, input, output, weight, + bias, finput); + + weight = 
THNN_(view_weight_local)(state, weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + + int64_t nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Augment the input + THCTensor_(resize3d)(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *finput3d, *output3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *oslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + + finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + THCTensor_(copy)(state, output_n, bias); + + // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THCTensor_(baddbmm)(state, output3d, ScalarConvert::to(1), + output3d, ScalarConvert::to(1), + weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THCTensor_(free)(state, output3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, islice); + THCTensor_(free)(state, oslice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + fgradInput, gradInput); + + weight = THNN_(view_weight_local)(state, weight); + + 
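+  // The locally-connected weight has just been viewed as a 3D tensor of shape
+  // (outputHeight*outputWidth) x nOutputPlane x (nInputPlane*kH*kW), i.e. one
+  // independent filter per output location. Below, gradInput is computed per
+  // sample by a batched matrix multiply of the transposed per-location weights
+  // with the matching gradOutput columns, then col2im scatters those column
+  // gradients back into the input layout.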
THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize3d)(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *fgradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 1, 2); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *fgradInput3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *gislice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THCTensor_(baddbmm)(state, fgradInput3d, + ScalarConvert::to(0), + fgradInput3d, ScalarConvert::to(1), + tweight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, fgradInput_n), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, fgradInput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, gislice); + THCTensor_(free)(state, goslice); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, fgradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, tweight); + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void 
THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, + gradBias, finput); + + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + gradWeight = THNN_(view_weight_local)(state, gradWeight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputPlane = THCTensor_(size)(state,gradWeight,2)/(kW*kH); + int64_t nOutputPlane = THCTensor_(size)(state,gradWeight,1); + + int batch = 1; + if (input->dim() == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *finput3d; + THCTensor *gwslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THCTensor_(baddbmm)(state, gradWeight, ScalarConvert::to(1), + gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutput_n); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, gwslice); + THCTensor_(free)(state, goslice); + THCTensor_(free)(state, islice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, 
nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradWeight); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu new file mode 100644 index 0000000..b5dab9b --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -0,0 +1,527 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionMM.cu" +#else + +static inline void THNN_(SpatialConvolutionMM_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t exactInputHeight = inputHeight + 2 * padH; + int64_t exactInputWidth = inputWidth + 2 * padW; + + if (exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld). " + "Kernel size: (%ld x %ld). Kernel size can't be greater than actual input size", + exactInputHeight, exactInputWidth, kH, kW); + } + + int64_t outputHeight = (exactInputHeight - kH) / dH + 1; + int64_t outputWidth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kH * kW); + } + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialConvolutionMM_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + + int freeWeight = 0; + + // Params: + int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; + int nOutputPlane = weight->size[0]; + + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0); + + input = THCTensor_(newContiguous)(state, input); + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
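+    // The bias GEMM in the batch loop forms the outer product of bias (length
+    // nOutputPlane) with this ones vector (length outputHeight*outputWidth),
+    // filling every spatial position of each output plane with its bias value
+    // in a single call.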
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + if (freeWeight) + THCTensor_(free)(state, weight); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); +} + +void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); + + // Params + int nInputPlane = weight->dim() == 2 ? 
weight->size[1]/(kW*kH) : weight->size[1]; + int nOutputPlane = weight->size[0]; + + int freeWeight = 0; + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, weight); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); + if (gradWeight) { + 
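+    // gradWeight is accumulated into directly through its raw data pointer by
+    // the GEMM in the batch loop below (beta = 1), so it has to be contiguous.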
THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1); + + // Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + + int freeWeight = 0; + if (gradWeight && gradWeight->dim() == 4) { + int64_t s1 = gradWeight->size[0]; + int64_t s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; + gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes 
column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, gradWeight); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu new file mode 100644 index 0000000..fbdd8b4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -0,0 +1,122 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" +#else + +void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) +{ + real alpha = ScalarConvert::to(alpha_); + real beta = ScalarConvert::to(beta_); + real k = ScalarConvert::to(k_); + + THCTensor_(resizeAs)(state, output, input); + THCTensor_(resizeAs)(state, scale, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->dim() == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + + int n_threads = batchSize * imsize_h * imsize_w; + LRNFillScale <<>>( + n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size, + alpha / local_size, k, THCTensor_(data)(state, scale)); + n_threads *= nInputPlane; + THCudaCheck(cudaGetLastError()); + LRNComputeOutput<<>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); +} + + +void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, + int local_size, accreal alpha_, accreal beta_, accreal k_) +{ + real alpha = ScalarConvert::to(alpha_); + real beta = ScalarConvert::to(beta_); + real k = ScalarConvert::to(k_); + (void) k; + THCTensor_(resizeAs)(state, gradInput, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->dim() == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int 
n_threads = batchSize * imsize_h * imsize_w; + LRNComputeDiff <<>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output), + THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w, + local_size, -beta, ScalarConvert::to(2) * alpha * beta / local_size, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + accreal alpha, + accreal beta, + accreal k) +{ + THNN_(LRNforward)(state, input, output, scale, size, alpha, beta, k); +} + +void THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + accreal alpha, + accreal beta, + accreal k) +{ + THNN_(LRNbackward)(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu new file mode 100644 index 0000000..61cd0e2 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -0,0 +1,254 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDepthwiseConvolution.cu" +#else + +void THNN_(SpatialDepthwiseConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, input, output, weight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!weight->is_empty() && THCTensor_(nDimension)(state, weight) == 4); + + // We assume that the input and weight Tensors are shaped properly by + // the caller, so we verify that here to some extent + + // Weight Tensor is shape (output_channels, 1, kH, kW) + THAssert(weight->size[1] == 1); + + // Input Tensor is shape (N, input_channels, H, W) + // We verify that the # of output_channels is a multiple of input_channels + THAssert(weight->size[0] % input->size[1] == 0); + + // Bias has same # of channels as output + if (bias) { + THAssert(bias->size[0] == weight->size[0]); + } + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? 
THCTensor_(newContiguous)(state, bias) : bias; + + // Following the behvaior of other THCUNN functions, we shape the output + // Tensor ourselves + + int batchSize = input->size[0]; + int height = input->size[2]; + int width = input->size[3]; + int outputHeight = (height + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int outputWidth = (width + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int outputChannels = weight->size[0]; + + THCTensor_(resize4d)(state, output, batchSize, outputChannels, outputHeight, outputWidth); + + // Create THCDeviceTensor + // Kernel currently relies upon all the Tensors to be contiguous, but we made + // them contiguous above + THCDeviceTensor dInput = toDeviceTensor(state, input); + THCDeviceTensor dWeight = toDeviceTensor(state, weight); + THCDeviceTensor dOutput = toDeviceTensor(state, output); + THCDeviceTensor dBias; + if (bias) { + dBias = toDeviceTensor(state, bias); + } + + int inputChannels = input->size[1]; + int depthwiseMultiplier = outputChannels / inputChannels; + + // One thread per output value + int n = THCTensor_(nElement)(state, output); + int blocks = GET_BLOCKS(n); + dim3 grid(blocks); + dim3 block(CUDA_NUM_THREADS); + if (kW == 3 && kH == 3) { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (kW == 1 && kH == 1) { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateOutput<<>>( + dInput, dOutput, dWeight, dBias, bias != NULL, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(SpatialDepthwiseConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, gradOutput, gradInput, weight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!weight->is_empty() && THCTensor_(nDimension)(state, weight) == 4); + THAssert(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 4); + + // Minimal shape checking, as above + // Same # of elements in batch + THAssert(input->size[0] == gradOutput->size[0]); + // Same # of filters as outputChannels + THAssert(weight->size[0] == gradOutput->size[1]); + + // Resize GradInput + THCTensor_(resizeAs)(state, gradInput, input); + + int inputChannels = input->size[1]; + int height = input->size[2]; + int width = input->size[3]; + + int outputChannels = gradOutput->size[1]; + int outputHeight = gradOutput->size[2]; + int outputWidth = gradOutput->size[3]; + + int depthwiseMultiplier = outputChannels / inputChannels; + + THCDeviceTensor dGradOutput = toDeviceTensor(state, gradOutput); + THCDeviceTensor dGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor dWeight = toDeviceTensor(state, weight); + + // Kernel currently relies upon all 
the Tensors to be contiguous + THAssert(dGradOutput.isContiguous()); + THAssert(dGradInput.isContiguous()); + THAssert(dWeight.isContiguous()); + + // One thread per gradInput value + int n = THCTensor_(nElement)(state, gradInput); + int blocks = GET_BLOCKS(n); + dim3 grid(blocks); + dim3 block(CUDA_NUM_THREADS); + if (kW == 3 && kH == 3) + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + else if (kW == 1 && kH == 1) + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + else + if (dW == 1 && dH == 1){ + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else if (dW == 2 && dH == 2) { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } else { + spatialDepthwiseConvolutionUpdateGradInput<<>>( + dGradOutput, dGradInput, dWeight, n, inputChannels, depthwiseMultiplier, outputChannels, width, + height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + } + + + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialDepthwiseConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradWeight); + + // Only handle 4D Input Tensors for now + THAssert(!input->is_empty() && THCTensor_(nDimension)(state, input) == 4); + THAssert(!gradOutput->is_empty() && THCTensor_(nDimension)(state, gradOutput) == 4); + THAssert(!gradWeight->is_empty() && THCTensor_(nDimension)(state, gradWeight) == 4); + + // Minimal shape checking as above + // Same # of elements in batch + THAssert(input->size[0] == gradOutput->size[0]); + // Same # of filters as outputChannels + 
THAssert(gradWeight->size[0] == gradOutput->size[1]); + + int batchSize = input->size[0]; + int inputChannels = input->size[1]; + int height = input->size[2]; + int width = input->size[3]; + + int outputChannels = gradOutput->size[1]; + int outputHeight = gradOutput->size[2]; + int outputWidth = gradOutput->size[3]; + + int depthwiseMultiplier = outputChannels / inputChannels; + + THCDeviceTensor dGradOutput = toDeviceTensor(state, gradOutput); + THCDeviceTensor dInput = toDeviceTensor(state, input); + THCDeviceTensor dGradWeight = toDeviceTensor(state, gradWeight); + + // Kernel currently relies upon all the Tensors to be contiguous + THAssert(dGradOutput.isContiguous()); + THAssert(dInput.isContiguous()); + THAssert(dGradWeight.isContiguous()); + + // We parallelize so that each block computes a single value in gradWeight + int blocks = outputChannels * kH * kW; + + + // Make sure we have enough threads to perform the reduction, and use this number + // to create the shared memory size for the reduction + dim3 grid(blocks); + dim3 block(getGradParamsNumThreads(batchSize)); + int smem = block.x * sizeof(accreal); + + spatialDepthwiseConvolutionAccGradParameters<<>>( + dGradOutput, dInput, dGradWeight, batchSize, inputChannels, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu new file mode 100644 index 0000000..1cac7f6 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -0,0 +1,497 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedConvolution.cu" +#else + +static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 4, 4, weight, + "non-empty 4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + THArgCheck(THCTensor_(isContiguous)(state, bias), 5, "bias tensor has to be contiguous"); + } + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? THCTensor_(newContiguous)(state, bias) : bias; + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
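+    // As in SpatialConvolutionMM above, the ones buffer lets the bias be
+    // broadcast over all outputHeight*outputWidth positions with one GEMM;
+    // the dilation parameters only change how im2col gathers the input
+    // columns further below.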
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], 
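Both GEMM calls above carry the note about cuBLAS assuming column-major data. The pattern is the usual one: a row-major m x n matrix is exactly the column-major view of its transpose, so C = A*B in row-major terms is obtained by asking the column-major routine for C^T = B^T * A^T, i.e. passing (n, m, k) and the operands in swapped order with leading dimensions n and k. A tiny self-contained check of that identity (plain C++, not cuBLAS, not part of the patch):

#include <cstdio>

// naive column-major gemm: C(MxN) = A(MxK) * B(KxN), no transposes
void gemm_colmajor(int M, int N, int K, const float* A, int lda,
                   const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      float acc = 0.f;
      for (int p = 0; p < K; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  // row-major A (2x3) and B (3x2); expected row-major C = A*B is {58 64; 139 154}
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[6] = {7, 8, 9, 10, 11, 12};
  float C[4] = {0, 0, 0, 0};
  const int m = 2, n = 2, k = 3;
  // "row-major gemm" via the column-major routine: pass (n, m, k), B first.
  gemm_colmajor(n, m, k, B, n, A, k, C, n);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
  return 0;
}

The forward call above instantiates this with m = nOutputPlane, n = outputHeight*outputWidth (columns->size[1]), k = nInputPlane*kH*kW, A = weight and B = columns, with beta = 1 so the result accumulates on top of the bias already written into output_n.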
gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias) { + THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); + } + THNN_(SpatialDilatedConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 1); + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + // Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int 
is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, 
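The bias path above uses a GEMV against the all-ones buffer, which is simply a GPU-friendly way of summing gradOutput over the spatial positions of each output plane. A CPU equivalent for one batch element, not part of the patch:

#include <cstdint>

// What the gemv against `ones` computes: gradOutput_n is laid out as
// [nOutputPlane][outputHeight*outputWidth], and each plane's sum is
// accumulated into gradBias, scaled by `scale`.
void accGradBias(const float* gradOutput_n, float* gradBias,
                 int64_t nOutputPlane, int64_t outputHW, float scale) {
  for (int64_t o = 0; o < nOutputPlane; ++o) {
    float s = 0.f;
    for (int64_t p = 0; p < outputHW; ++p)
      s += gradOutput_n[o * outputHW + p];
    gradBias[o] += scale * s;
  }
}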
outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu new file mode 100644 index 0000000..7425345 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu @@ -0,0 +1,246 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, THCIndexTensor *indices, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationH > 0 && dilationW > 0, 12, + "dilation should be greater than zero, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + int batchSize = 1; + + if (ndim == 4) { + batchSize = input->size[0]; + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t nInputRows = input->size[dimh]; + int64_t nInputCols = input->size[dimw]; + int64_t nOutputRows, nOutputCols; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
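The output-extent computation that appears repeatedly in this pooling file (shape check, updateOutput and updateGradInput) is the standard pooling formula with an effective kernel of dilation*(k-1)+1, ceil or floor rounding depending on ceil_mode, and a correction so that the last window still starts inside the padded image. A standalone per-dimension sketch, not part of the patch:

#include <cmath>
#include <cstdint>

int64_t pooledOutSize(int64_t inSize, int kernel, int stride, int pad,
                      int dilation, bool ceil_mode) {
  const double span =
      double(inSize - (dilation * (kernel - 1) + 1) + 2 * pad) / stride;
  int64_t out =
      (ceil_mode ? (int64_t)std::ceil(span) : (int64_t)std::floor(span)) + 1;
  if (pad) {
    // ensure that the last pooling window starts inside the image,
    // which ceil_mode can otherwise violate
    if ((out - 1) * stride >= inSize + pad) --out;
  }
  return out;
}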
Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, 4, 0, batchSize); + THCUNN_check_dim_size_indices(state, indices, 4, 1, nOutputPlane); + THCUNN_check_dim_size_indices(state, indices, 4, 2, nOutputRows); + THCUNN_check_dim_size_indices(state, indices, 4, 3, nOutputCols); + } +} + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + + THCUNN_assertSameGPU(state, 3, input, output, indices); + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (state, input, NULL, NULL, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + THCUNN_resizeAs_indices(state, indices, output); + + THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices); + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); +} + +void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (state, input, gradOutput, indices, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + input = 
THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int64_t nOutputCols, nOutputRows; + + if (input->_dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + dim3 grid; + int imgcount = nInputCols * nInputRows; + const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; + grid.x = blocks; + grid.y = batchSize; + grid.z = nInputPlane; + uint64_t maxGridY = THCState_getCurrentDeviceProperties(state)->maxGridSize[1]; + uint64_t maxGridZ = THCState_getCurrentDeviceProperties(state)->maxGridSize[2]; + if (maxGridY < grid.y) grid.y = maxGridY; + if (maxGridZ < grid.z) grid.z = maxGridZ; + MaxPoolBackward <<< grid, BACKWARD_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, gradOutput); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu new file mode 100644 index 0000000..0535653 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFractionalMaxPooling.cu @@ -0,0 +1,157 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFractionalMaxPooling.cu" +#else + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples) +{ + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int64_t numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + /* sizes */ + int64_t numPlanes = THCTensor_(size)(state, input, planeDim); + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH + poolSizeH - 1 <= 
inputH, 6, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 <= inputW, 5, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + THCDeviceTensor devIndices; + THCDeviceTensor devSamples = + toDeviceTensor(state, randomSamples); + + if (numInputDims == 3) { + /* resize output */ + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize3d)(state, indices, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize4d)(state, indices, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devInput.getSize(1), + devInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); + +#define SFMP_UPDATE_OUTPUT(POOL_W) \ + SpatialFractionalMaxPooling_updateOutput \ + <<>>( \ + devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH); + +#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ + case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break + + switch (poolSizeW) { + SFMP_UPDATE_OUTPUT_CASE(2); + SFMP_UPDATE_OUTPUT_CASE(3); + SFMP_UPDATE_OUTPUT_CASE(4); + SFMP_UPDATE_OUTPUT_CASE(5); + SFMP_UPDATE_OUTPUT_CASE(6); + SFMP_UPDATE_OUTPUT_CASE(7); + default: + // dynamic pool width + SFMP_UPDATE_OUTPUT_CASE(-1); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices) +{ + int dimh = 1; + int dimw = 2; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + dimh++; + dimw++; + } + + /* sizes */ + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected"); + + /* resize */ + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + THCDeviceTensor devIndices; + + /* backprop */ + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devGradOutput.getSize(2) * 
devGradOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devGradInput.getSize(1), + devGradInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); + + SpatialFractionalMaxPooling_updateGradInput + <<>>( + devGradInput, devGradOutput, devIndices); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialFullConvolution.cu b/aten/src/THCUNN/generic/SpatialFullConvolution.cu new file mode 100644 index 0000000..6f9fa98 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFullConvolution.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu" +#else + +void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + +void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, gradColumns, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + + +void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale_) +{ + THNN_(SpatialFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, + columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_); +} + +#endif \ No newline at end of file diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu new file mode 100644 index 0000000..58ab364 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -0,0 +1,498 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFullDilatedConvolution.cu" +#else + +static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *gradOutput, + THCTensor *weight, THCTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, + int adjH, int adjW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + THArgCheck((adjW < dW || adjW < dilationW) && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller than either stride or dilation, but got adjH: %d adjW: %d dH: %d dW: %d dilationH: %d dilationW: %d", + adjH, adjW, dH, dW, dilationH, dilationW); + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D 
weight tensor expected, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output spatial size per channel: (%ld x %ld). Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[0]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[1]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 6, input, output, weight, + bias, columns, ones); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); + + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2im( + THCState_getCurrentStream(state), + THCTensor_(data)(state, columns), + nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n_ + ); + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); +} + +void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + + +void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, adjH, adjW, 1); + + int nOutputPlane; + if (gradWeight != NULL) { + nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + } else if (gradBias != NULL) { + nOutputPlane = THCTensor_(size)(state, gradBias, 0); + } else { + return; + } + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THCTensor_(isContiguous)(state, columns), 6, "columns needs to be 
contiguous"); + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + 
THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, input->size[1], inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu new file mode 100644 index 0000000..0e9afdf --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu @@ -0,0 +1,97 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" +#else + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimension)(state, input) == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimension)(state, grid) == 4, 2, grid, + "4D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t iheight = THCTensor_(size)(state, input, 2); + int64_t iwidth = THCTensor_(size)(state, input, 3); + int64_t oheight = THCTensor_(size)(state, grid, 1); + int64_t owidth = THCTensor_(size)(state, grid, 2); + + THCUNN_check_dim_size(state, grid, 4, 0, nbatch); + THCUNN_check_dim_size(state, grid, 4, 3, 2); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); + } +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state,grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + // resize output to the same shape as input + THCTensor_(resize4d)(state, output, N, C, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = 
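The sampler resizes `output` to [N, C, H, W] with H and W taken from the grid, so each grid cell carries one (x, y) pair of normalized coordinates per output location. A scalar CPU sketch of a single bilinear lookup, not part of the patch, assuming the common convention that coordinates lie in [-1, 1] and map to pixel centers via (g + 1) / 2 * (size - 1); the padding_mode handling done by the CUDA kernel is ignored and out-of-range reads are simply clamped here:

#include <cmath>
#include <cstddef>
#include <vector>

float bilinearSample(const std::vector<float>& img, int H, int W,
                     float gx, float gy) {
  const float ix = (gx + 1.f) / 2.f * (W - 1);   // unnormalize x to [0, W-1]
  const float iy = (gy + 1.f) / 2.f * (H - 1);   // unnormalize y to [0, H-1]
  const int x0 = static_cast<int>(std::floor(ix));
  const int y0 = static_cast<int>(std::floor(iy));
  const float tx = ix - x0, ty = iy - y0;
  auto at = [&](int y, int x) {
    x = x < 0 ? 0 : (x > W - 1 ? W - 1 : x);
    y = y < 0 ? 0 : (y > H - 1 ? H - 1 : y);
    return img[static_cast<std::size_t>(y) * W + x];
  };
  // weighted average of the four surrounding pixels
  return (1 - ty) * ((1 - tx) * at(y0, x0) + tx * at(y0, x0 + 1)) +
         ty * ((1 - tx) * at(y0 + 1, x0) + tx * at(y0 + 1, x0 + 1));
}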
THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state, grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); + THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxPooling.cu b/aten/src/THCUNN/generic/SpatialMaxPooling.cu new file mode 100644 index 0000000..6be838d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialMaxPooling.cu @@ -0,0 +1,40 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu" +#else + +#include "../common.h" + +void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu new file mode 100644 index 0000000..90d6e0a --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu" +#else + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + THCUNN_check_shape_indices(state, indices, input); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + + if (input->dim() == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth); + THCTensor_(zero)(state, output); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, 
THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + if(input->dim() == 3) + THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices); +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); + THCUNN_check_shape_indices(state, indices, input); + + int64_t nInputCols, nInputRows, nInputPlane, batchSize; + int dimw = 2; + int dimh = 1; + + if (input->dim() == 3) { + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + ++dimw; + ++dimh; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + nInputCols = input->size[dimw]; + nInputRows = input->size[dimh]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", + oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + // clean + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu new file mode 100644 index 0000000..0c90944 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -0,0 +1,137 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu" +#else + +void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(padL < inputW && padR < inputW, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padL, padR, dimw, THCTensor_(sizeDesc)(state, input).str); + + THArgCheck(padT < inputH && 
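Max unpooling is a pure scatter: the output plane is zero-filled and every pooled value is written back to the position recorded in `indices` by the matching max-pooling pass (the backward pass gathers from those same positions). A per-plane CPU sketch, not part of the patch, assuming the indices are 0-based offsets into each oheight x owidth plane:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

void maxUnpoolPlane(const std::vector<float>& input,      // pooled values
                    const std::vector<int64_t>& indices,  // same shape as input
                    std::vector<float>& output,           // oheight * owidth
                    int64_t oheight, int64_t owidth) {
  std::fill(output.begin(), output.end(), 0.f);
  for (std::size_t i = 0; i < input.size(); ++i) {
    const int64_t target = indices[i];
    if (target >= 0 && target < oheight * owidth)
      output[static_cast<std::size_t>(target)] = input[i];
  }
}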
padB < inputH, 6, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padT, padB, dimh, THCTensor_(sizeDesc)(state, input).str); + + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 || outputH >= 1, 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + inputH, inputW, outputH, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReflectionPadding_updateOutput<<>>( + devInput, devOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int oheight = iheight + padT + padB; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); + + SpatialReflectionPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu new file mode 100644 index 0000000..6ab694d --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -0,0 +1,127 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu" +#else + +void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 || outputH >= 1 , 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + inputH, inputW, outputH, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateOutput<<>>( + devInput, devOutput, padT, padB, padL, padR); + +} + +void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int oheight = iheight + padT + padB; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. 
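The reflection and replication kernels themselves are defined outside these generic files; for orientation only, one common way to express the two per-dimension source-index mappings is sketched below, not part of the patch. Reflection mirrors about the edge without repeating it (which is why the reflection path checks pad < input size above), while replication clamps to the edge:

int reflectIndex(int out, int pad, int inSize) {
  int i = out - pad;                           // position relative to the unpadded input
  if (i < 0) i = -i;                           // mirror across the left/top edge
  if (i >= inSize) i = 2 * (inSize - 1) - i;   // mirror across the right/bottom edge
  return i;
}

int replicateIndex(int out, int pad, int inSize) {
  int i = out - pad;
  if (i < 0) i = 0;                            // clamp to the first row/column
  if (i > inSize - 1) i = inSize - 1;          // clamp to the last row/column
  return i;
}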
Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialSubSampling.cu b/aten/src/THCUNN/generic/SpatialSubSampling.cu new file mode 100644 index 0000000..ea71c82 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialSubSampling.cu @@ -0,0 +1,259 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialSubSampling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + int kW, int kH) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + int dimc = 2; + int dimr = 1; + int dimp = 0; + + if (input->dim() == 4) { + dimc++; + dimr++; + dimp++; + } + + int64_t nInputCols = input->size[dimc]; + int64_t nInputRows = input->size[dimr]; + THArgCheck(input->size[dimp] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); +} + +void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH) +{ + real *weight_data = THCTensor_(data)(state, weight); + real *bias_data = THCTensor_(data)(state, bias); + real *output_data; + real *input_data; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + THCUNN_assertSameGPU(state, 4, input, output, weight, bias); + THNN_(SpatialSubSampling_shapeCheck)(state, input, NULL, weight, kW, kH); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + int64_t nOutputCols = (nInputCols - kW) / dW + 1; + int64_t nOutputRows = (nInputRows - kH) / dH + 1; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample <<>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + int64_t nOutputCols = (nInputCols - kW) / dW + 1; + int64_t nOutputRows = (nInputRows - kH) / dH + 1; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample <<>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); + THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, weight, kW, kH); + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + + real *weight_data = THCTensor_(data)(state, weight); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + + real *weight_data = THCTensor_(data)(state, weight); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale) +{ + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias); + THNN_(SpatialSubSampling_shapeCheck)(state, input, gradOutput, gradWeight, kW, kH); + + int nInputPlane = THCTensor_(size)(state, gradWeight, 0); + + if (input->dim() == 3) { + int64_t nInputCols = input->size[2]; + int64_t nInputRows = input->size[1]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *input_data; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + // cuda blocks & threads: + dim3 blocks(nInputPlane); + dim3 threads(32,8); + + // run gradweight kernel + subgradweight <<>> ( + input_data, gradOutput_data, gradWeight_data, gradBias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); + THCudaCheck(cudaGetLastError()); + } else { + int64_t nInputCols = input->size[3]; + int64_t nInputRows = input->size[2]; + int64_t nbatch = input->size[0]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *input_data; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + // cuda blocks & threads: + dim3 blocks(nInputPlane); + dim3 threads(32,8); + + // run gradweight kernel + int64_t sl; + for (sl=0; sl <<>> ( + input_data + sl*input->stride[0], + gradOutput_data + sl*gradOutput->stride[0], + gradWeight_data, gradBias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); + } + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu new file mode 100644 index 0000000..f9cc0a4 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu @@ -0,0 +1,105 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if 
(input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); + } +} + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputHeight = THCTensor_(size)(state, input, 2); + int inputWidth = THCTensor_(size)(state, input, 3); + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize4d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputHeight, outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rheight, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputHeight, inputWidth, + outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rheight, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu new file mode 100644 index 0000000..a71fc5b --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu @@ -0,0 +1,101 @@ 
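Both upsampling files here (bilinear above, nearest-neighbor below) use the same launch recipe: one CUDA thread per output pixel (num_kernels = outputHeight * outputWidth), grouped into THCCeilDiv(num_kernels, maxThreadsPerBlock) blocks on the current stream. The bilinear variant additionally derives a height/width rescaling factor through linear_upsampling_compute_scale, whose definition lives in ../linear_upsampling.h and is not shown in this diff; the standalone sketch below therefore assumes the usual align_corners convention and is illustrative only, not code from this patch.

// upsampling_launch_sketch.cpp -- standalone illustration (assumed helper
// semantics; not part of this patch). Builds with any C++11 compiler.
#include <cstdio>

// Assumed behaviour of linear_upsampling_compute_scale(): with align_corners
// the corner pixels of input and output coincide, so the step is
// (in - 1) / (out - 1); otherwise it is the plain size ratio in / out.
static double compute_scale(int inputSize, int outputSize, bool align_corners) {
  if (outputSize <= 1) return 0.0;
  return align_corners ? static_cast<double>(inputSize - 1) / (outputSize - 1)
                       : static_cast<double>(inputSize) / outputSize;
}

// Equivalent of THCCeilDiv: round a/b up to the next whole block.
static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int inputHeight = 4, outputHeight = 8, outputWidth = 12;
  const int maxThreadsPerBlock = 1024;  // stands in for maxThreadsPerBlock from the device properties

  // One thread per output pixel, as in the updateOutput/updateGradInput bodies above.
  const int num_kernels = outputHeight * outputWidth;
  std::printf("rheight (align_corners=true)  = %f\n", compute_scale(inputHeight, outputHeight, true));
  std::printf("rheight (align_corners=false) = %f\n", compute_scale(inputHeight, outputHeight, false));
  std::printf("launch: %d block(s) x %d threads for %d output pixels\n",
              ceil_div(num_kernels, maxThreadsPerBlock), maxThreadsPerBlock, num_kernels);
  return 0;
}

The same ceil-division also shapes the padding kernels earlier in this diff, which use THCCeilDiv(outputPlaneSize, 256) blocks in x with at most 256 threads per block, and plane/batch indices in the y and z grid dimensions.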
+#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 4, 2, input, + "4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); + } +} + + +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputHeight = THCTensor_(size)(state, input, 2); + int inputWidth = THCTensor_(size)(state, input, 3); + + THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + + THCTensor_(resize4d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputHeight, + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_4d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + + +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, + inputHeight, inputWidth, outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + + const int num_kernels = outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + + nearest_neighbor_4d_kernel_backward <<>>(num_kernels, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Sqrt.cu b/aten/src/THCUNN/generic/Sqrt.cu new file mode 100644 index 0000000..57a6fc8 --- /dev/null +++ b/aten/src/THCUNN/generic/Sqrt.cu @@ 
-0,0 +1,32 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Sqrt.cu" +#else + +#include "../common.h" + +void THNN_(Sqrt_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal eps_) +{ + real eps = ScalarConvert::to(eps_); + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor(eps)); +} + +void THNN_(Sqrt_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_shape(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/Square.cu b/aten/src/THCUNN/generic/Square.cu new file mode 100644 index 0000000..745502b --- /dev/null +++ b/aten/src/THCUNN/generic/Square.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Square.cu" +#else + +#include "../common.h" + +void THNN_(Square_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, squareupdateOutput_functor()); +} + +void THNN_(Square_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_check_shape(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h new file mode 100644 index 0000000..eaadf66 --- /dev/null +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -0,0 +1,1694 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCUNN.h" +#else + +#include "Reduction.h" + +THC_API void THNN_(Abs_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Abs_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(AbsCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(AbsCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(BatchNormalization_updateOutput)( + THCState *state, + THCTensor *input_, + THCTensor *output_, + THCTensor *weight_, // [OPTIONAL] + THCTensor *bias_, // [OPTIONAL] + THCTensor *runningMean_, // [OPTIONAL] if train + THCTensor *runningVar_, // [OPTIONAL] if train + THCTensor *saveMean_, + THCTensor *saveStd_, + bool train, + double momentum, + double eps); + +THC_API void THNN_(BatchNormalization_backward)( + THCState *state, + THCTensor *input_, + THCTensor *gradOutput_, + THCTensor *gradInput_, // [OPTIONAL] + THCTensor *gradWeight_, // [OPTIONAL] + THCTensor *gradBias_, // [OPTIONAL] + THCTensor *weight_, // [OPTIONAL] + THCTensor *runningMean_, // [OPTIONAL] if train + THCTensor *runningVar_, // [OPTIONAL] if train + THCTensor *saveMean_, // [OPTIONAL] if !train + THCTensor *saveStd_, // [OPTIONAL] if !train + 
bool train, + double scale, + double eps); + +THC_API void THNN_(BCECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights); // [OPTIONAL] + +THC_API void THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights); // [OPTIONAL] + +THC_API void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal alpha, + accreal scale, + bool inplace); + +THC_API void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal alpha, + accreal scale); + +THC_API void THNN_(FeatureLPPooling_updateOutput)( + THCState* state, + THCTensor* inputTH, + THCTensor* outputTH, + accreal power, + int width, + int stride, + bool batchMode); + +THC_API void THNN_(FeatureLPPooling_updateGradInput)( + THCState* state, + THCTensor* gradOutputTH, + THCTensor* inputTH, + THCTensor* outputTH, + THCTensor* gradInputTH, + accreal power, + int width, + int stride, + bool batchMode); + +THC_API void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal min_val, + accreal max_val, + bool inplace); + +THC_API void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal min_val, + accreal max_val, + bool inplace); + +THC_API void THNN_(GatedLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int dim); + +THC_API void THNN_(GatedLinear_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int dim); + +THC_API void THNN_(Im2Col_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(Im2Col_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(Col2Im_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + + THC_API void THNN_(Col2Im_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + 
int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +THC_API void THNN_(LeakyReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal negval, + bool inplace); + +THC_API void THNN_(LeakyReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal negval, + bool inplace); + +THC_API void THNN_(GRUFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, // [OPTIONAL] + THCTensor *bias2, // [OPTIONAL] + THCTensor *hx, + THCTensor *hy, + THCTensor *storage); + +THC_API void THNN_(GRUFused_updateGradInput)( + THCState *state, + THCTensor *gradInInput, + THCTensor *gradInHidden, + THCTensor *gradOutput, + THCTensor *gradInputHx, + THCTensor *storage); + +THC_API void THNN_(LSTMFused_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *hidden, + THCTensor *bias1, // [OPTIONAL] + THCTensor *bias2, // [OPTIONAL] + THCTensor *cx, + THCTensor *hy, + THCTensor *cy); + +THC_API void THNN_(LSTMFused_updateGradInput)( + THCState *state, + THCTensor *storage, + THCTensor *gradInGates, + THCTensor *prevC, + THCTensor *cy, + THCTensor *gradOutput, + THCTensor *gradOutputCell, + THCTensor *gradInputCx); + +THC_API void THNN_(LogSigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *buffer); + +THC_API void THNN_(LogSigmoid_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *buffer); + +THC_API void THNN_(LookupTable_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *count, + THCIndexTensor *sorted, // [OPTIONAL] + THCIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + accreal scale); + +THC_API void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + accreal maxNorm, + accreal normType); + +THC_API void THNN_(LookupTableBag_updateOutput)( + THCState *state, + THCIndexTensor *input, + THCIndexTensor *offsets, + THCTensor *weight, + THCTensor *output, + THCIndexTensor *offset2bag, + int mode, + THCIndexTensor *seq_length); // [OPTIONAL] + +THC_API void THNN_(LookupTableBag_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *offset2bag, + THCIndexTensor *count, + THCIndexTensor *sortedIndices, + THCIndexTensor *origIndices, + bool scaleGradByFreq, + int mode, + THCIndexTensor *seq_length, // [OPTIONAL] + accreal scale_); + +THC_API void THNN_(L1Cost_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(L1Cost_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, // [OPTIONAL] + THCTensor *gradInput); + +THC_API void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + accreal margin); + +THC_API void THNN_(MarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + accreal margin); + +THC_API void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor 
*target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + int64_t reduction); + +THC_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *istarget, + int64_t reduction); + +THC_API void THNN_(MultiMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + int p, + THCTensor *weights, // [OPTIONAL] + accreal margin); + +THC_API void THNN_(MultiMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + int p, + THCTensor *weights, // [OPTIONAL] + accreal margin); + +THC_API void THNN_(PReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight); + +THC_API void THNN_(PReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight); + +THC_API void THNN_(PReLU_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradWeight, + accreal scale); + +THC_API void THNN_(SmoothL1Criterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(SmoothL1Criterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(SparseLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias); + +THC_API void THNN_(SparseLinear_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(SparseLinear_legacyUpdateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias); + +THC_API void THNN_(SparseLinear_legacyAccGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(SparseLinear_zeroGradParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput); + +THC_API void THNN_(SparseLinear_updateParameters)( + THCState *state, + THCTensor *weight, + THCTensor *bias, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput, + accreal learningRate); + +THC_API void THNN_(IndexLinear_updateOutput)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *normalizedValues, + int train); + +THC_API void THNN_(IndexLinear_accGradParameters)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + 
THCTensor *weight, + THCTensor *bias, + THCTensor* valuesBuffer, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(IndexLinear_accUpdateGradParameters)( + THCState *state, + THCIndexTensor *keys, + int64_t keysOffset, + THCTensor *values, + THCIndexTensor *sizes, + THCIndexTensor *cumSumSizes, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + accreal weightDecay, + accreal scale); + +THC_API void THNN_(IndexLinear_updateParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + THCIndexTensor *runningKeys, + THCIndexTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate); + +THC_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeW, + int osizeH); + +THC_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices); + +THC_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeW, + int osizeH); + +THC_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(SpatialAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(SpatialAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight, + int64_t ignore_index); + +THC_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); + +THC_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); + +THC_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale); + +THC_API void 
THNN_(SpatialConvolutionMM_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +THC_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +THC_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +THC_API void THNN_(SpatialDepthwiseConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + accreal alpha, + accreal beta, + accreal k); + +THC_API void THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + accreal alpha, + accreal beta, + accreal k); + +THC_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +THC_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale); + +THC_API void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int 
kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale); + +THC_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +THC_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +THC_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples); + +THC_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices); + +THC_API void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +THC_API void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale); + +THC_API void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +THC_API void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +THC_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight); + +THC_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight); + +THC_API void THNN_(SpatialReflectionPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor 
*gradInput, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB); + +THC_API void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH); + +THC_API void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH); + +THC_API void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale); + +THC_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(RReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace, + void *generator); + +THC_API void THNN_(RReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace); + +THC_API void THNN_(Sigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Sigmoid_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(SoftMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int64_t reduction); + +THC_API void THNN_(SoftMarginCriterion_updateGradInput)( + 
THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradOutput, + THCTensor *gradInput, + int64_t reduction); + +THC_API void THNN_(SoftPlus_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal beta, + accreal threshold); + +THC_API void THNN_(SoftPlus_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + accreal beta, + accreal threshold); + +THC_API void THNN_(SoftShrink_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal lambda); + +THC_API void THNN_(SoftShrink_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal lambda); + +THC_API void THNN_(Square_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Square_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(Sqrt_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal eps); + +THC_API void THNN_(Sqrt_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(Tanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +THC_API void THNN_(Tanh_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +THC_API void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); + +THC_API void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW); + +THC_API void THNN_(TemporalConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int dW, + accreal scale); + +THC_API void THNN_(TemporalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int dW); + +THC_API void THNN_(TemporalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int dW); + +THC_API void THNN_(TemporalRowConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); + +THC_API void THNN_(TemporalRowConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); + +THC_API void THNN_(TemporalRowConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale); + +THC_API void THNN_(TemporalReflectionPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR); + +THC_API void THNN_(TemporalReflectionPadding_updateGradInput)( + THCState *state, + THCTensor 
*input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR); + +THC_API void THNN_(TemporalReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR); + +THC_API void THNN_(TemporalReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR); + +THC_API void THNN_(TemporalUpSamplingLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth, + bool align_corners); + +THC_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth, + bool align_corners); + +THC_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth); + +THC_API void THNN_(TemporalUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth); + +THC_API void THNN_(Threshold_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal threshold, + accreal val, + bool inplace); + +THC_API void THNN_(Threshold_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + accreal threshold, + accreal val, + bool inplace); + +THC_API void THNN_(VolumetricAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +THC_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +// VolumetricConvolution is legacy and purposefully not bound by ATen +THC_API void THNN_(VolumetricConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + accreal scale); + +THC_API void THNN_(VolumetricDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +THC_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *columns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, 
int padW, int padH, + int dilationT, int dilationW, int dilationH); + +THC_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale); + +THC_API void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, // [OPTIONAL] + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, + accreal scale); + +THC_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +THC_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +THC_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples); + +THC_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices); + +THC_API void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH); + +THC_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH); + +THC_API 
void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, // [OPTIONAL] + THCTensor *gradBias, // [OPTIONAL] + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + accreal scale); + +THC_API void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode); + +THC_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode); + +THC_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +THC_API void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH); + +THC_API void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices); + +THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeT, + int osizeW, + int osizeH); + +THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +THC_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +THC_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +THC_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth); + +THC_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners); + +THC_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int 
outputWidth, + bool align_corners); + +#endif diff --git a/aten/src/THCUNN/generic/Tanh.cu b/aten/src/THCUNN/generic/Tanh.cu new file mode 100644 index 0000000..32abd47 --- /dev/null +++ b/aten/src/THCUNN/generic/Tanh.cu @@ -0,0 +1,29 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Tanh.cu" +#else + +#include "../common.h" + +void THNN_(Tanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THCTensor_(tanh)(state, output, input); +} + +void THNN_(Tanh_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_check_shape(state, output, gradOutput); + THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, tanh_updateGradInput_functor()); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu new file mode 100644 index 0000000..1bb1761 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -0,0 +1,397 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalConvolution.cu" +#else + +static inline void THNN_(TemporalConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + int kW, + int dW, + int *inputFrameSize) { + + THArgCheck(kW > 0, 9, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 11, + "stride should be greater than zero, but got dW: %d", dW); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck(input->size[dimF] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[dimF], *inputFrameSize); + } + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. 
Got: %d, Expected: %d", + input->size[dimS], kW); +} + +void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize) { + + THCTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k, i; + + int dimS = 0; // sequence dimension + + THCUNN_assertSameGPU(state, 4, input, output, weight, bias); + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, &inputFrameSize); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, "bias must be contiguous"); + + if (input->dim() == 3) + { + dimS = 1; + } + + input = THCTensor_(newContiguous)(state, input); + outputWindow = THCTensor_(new)(state); + inputWindow = THCTensor_(new)(state); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->dim() == 2) + { + THCTensor_(resize2d)(state, output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, output, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 0, 1); + THCTensor_(addmm)(state, outputWindow, ScalarConvert::to(1), outputWindow, ScalarConvert::to(1), inputWindow, tweight); + THCTensor_(free)(state, tweight); + } + } + else + { + THCTensor *outputSample = THCTensor_(new)(state); + THCTensor *inputSample = THCTensor_(new)(state); + int nBatchFrame = input->size[0]; + + THCTensor_(resize3d)(state, output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, outputSample, output, 0, i); + THCTensor_(select)(state, inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, outputSample, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 0, 1); + THCTensor_(addmm)(state, 
outputWindow, ScalarConvert::to(1), outputWindow, ScalarConvert::to(1), inputWindow, tweight); + THCTensor_(free)(state, tweight); + } + } + THCTensor_(free)(state, outputSample); + THCTensor_(free)(state, inputSample); + } + + THCTensor_(free)(state, outputWindow); + THCTensor_(free)(state, inputWindow); + THCTensor_(free)(state, input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW) { + + int64_t nInputFrame; + int64_t nOutputFrame; + + THCTensor *gradOutputWindow; + THCTensor *gradInputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, NULL); + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + + /* Not necessary with partial backprop: */ + gradOutputWindow = THCTensor_(new)(state); + gradInputWindow = THCTensor_(new)(state); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + if (gradOutput->dim() == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); + } + } + else + { + THCTensor *gradOutputSample = THCTensor_(new)(state); + THCTensor *gradInputSample = THCTensor_(new)(state); + int64_t nBatchFrame = input->size[0]; + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); + THCTensor_(select)(state, gradInputSample, gradInput, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); + } + } + THCTensor_(free)(state, gradOutputSample); + THCTensor_(free)(state, gradInputSample); + } + + 
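+  // Release the contiguous copies and the temporary window views.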
THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradOutputWindow); + THCTensor_(free)(state, gradInputWindow); + +} + +void THNN_(TemporalConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int dW, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + int64_t nInputFrame; + int64_t nOutputFrame; + + THCTensor *gradOutputWindow; + THCTensor *inputWindow; + int64_t k, i; + + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, NULL); + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + /* Not necessary with partial backprop: */ + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + gradOutputWindow = THCTensor_(new)(state); + inputWindow = THCTensor_(new)(state); + + if (input->dim() == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, gradOutputWindow, gradOutput, 0, k); + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THCTensor *tgradOutputWindow = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); + THCTensor_(addmm)(state, gradWeight, ScalarConvert::to(1), gradWeight, scale, tgradOutputWindow, inputWindow); + THCTensor_(free)(state, tgradOutputWindow); + } + } + else + { + THCTensor *gradOutputSample = THCTensor_(new)(state); + THCTensor *inputSample = THCTensor_(new)(state); + int64_t nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); + THCTensor_(select)(state, inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, gradOutputWindow, gradOutputSample, 0, k); + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THCTensor *tgradOutputWindow = THCTensor_(new)(state); + THCTensor_(transpose)(state, tgradOutputWindow, 
gradOutputWindow, 0, 1); + THCTensor_(addmm)(state, gradWeight, ScalarConvert::to(1), gradWeight, scale, tgradOutputWindow, inputWindow); + THCTensor_(free)(state, tgradOutputWindow); + } + } + THCTensor_(free)(state, gradOutputSample); + THCTensor_(free)(state, inputSample); + } + + THCTensor_(free)(state, gradOutputWindow); + THCTensor_(free)(state, inputWindow); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, input); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalMaxPooling.cu b/aten/src/THCUNN/generic/TemporalMaxPooling.cu new file mode 100644 index 0000000..e355ebd --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalMaxPooling.cu @@ -0,0 +1,188 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu" +#else + +static inline void THNN_(TemporalMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int kW, int dW) { + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + int input_w; + int input_n; + int output_w; + int ndims = input->dim(); + + if (ndims == 3) + { + dimT = 1; + dimF = 2; + } + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(input->size[dimT] >= kW, 2, + "input sequence smaller than kernel size. Got: %d, Expected: %d", + input->size[dimT], kW); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndims, dimT, output_w); + THCUNN_check_dim_size(state, gradOutput, ndims, dimF, input_n) + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, ndims, dimT, output_w); + THCUNN_check_dim_size_indices(state, indices, ndims, dimF, input_n); + } +} + +void THNN_(TemporalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int dW) { + + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + + int batch = 1; + int input_w; + int input_n; + int output_w; + int nthreads; + + real *input_data; + real *output_data; + THCIndex_t *indices_data; + + THCUNN_assertSameGPU(state, 3, input, output, indices); + THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW); + if (input->dim() == 3) + { + dimT = 1; + dimF = 2; + batch = input->size[0]; + } + input = THCTensor_(newContiguous)(state, input); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + if (input->dim() == 2) + { + THCTensor_(resize2d)(state, output, output_w, input->size[dimF]); + THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]); + } + else + { + THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]); + THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]); + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + indices_data = THCIndexTensor_(data)(state, indices); + + dim3 blocks(batch); + nthreads = (output_w / 32) * 32; + if (output_w % 32 > 0) { + nthreads += 32; + } + + if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { + blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; 
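+    // Round up: leftover threads that do not fill a whole block get one extra row of blocks.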
+ if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) { + blocks.y += 1; + } + nthreads = TEMPORAL_MAX_POOLING_THREADS; + } + + dim3 threads(nthreads); + cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, input); + +} + +void THNN_(TemporalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int dW) { + + int dimT = 0; // Temporal dimension + int dimF = 1; // Feature dimension + + int batch = 1; + int input_w; + int input_n; + int output_w; + int nthreads; + + real *gradInput_data; + real *gradOutput_data; + THCIndex_t *indices_data; + + THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices); + THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + if (input->dim() == 3) + { + dimT = 1; + dimF = 2; + batch = input->size[0]; + } + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + input_w = input->size[dimT]; + input_n = input->size[dimF]; + output_w = (input_w - kW) / dW + 1; + + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + indices_data = THCIndexTensor_(data)(state, indices); + + dim3 blocks(batch); + nthreads = (output_w / 32) * 32; + if (output_w % 32 > 0) { + nthreads += 32; + } + + if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { + blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; + if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) { + blocks.y += 1; + } + nthreads = TEMPORAL_MAX_POOLING_THREADS; + } + + dim3 threads(nthreads); + if (kW <= dW) { + cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); + } else { + cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( + gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); + } + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu new file mode 100644 index 0000000..394c796 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -0,0 +1,119 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalReflectionPadding.cu" +#else + +void THNN_(TemporalReflectionPadding_updateOutput)(THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 2 || numInputDims == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 3) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(padL < inputW && padR < inputW, 4, + "Padding 
size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + padL, padR, dimw, THCTensor_(sizeDesc)(state, input).str); + + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1 , 2, + "input (W: %d)is too small." + " Calculated output W: %d", + inputW, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 2) { + THCTensor_(resize2d)(state, output, numPlanes, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<3>(); + devOutput = toDeviceTensor(state, output).upcastOuter<3>(); + } else { + THCTensor_(resize3d)(state, output, numBatch, numPlanes, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReflectionPadding_updateOutput<<>>( + devInput, devOutput, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(TemporalReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 3) { + planeDim++; + dimw++; + } + int iwidth = input->size[dimw]; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 2) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<3>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<3>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
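+                 /* use at most 256 threads per block; smaller planes get one thread per output element */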
256 : outputPlaneSize); + + TemporalReflectionPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu new file mode 100644 index 0000000..11637dc --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -0,0 +1,114 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalReplicationPadding.cu" +#else + +void THNN_(TemporalReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 2 || numInputDims == 3), 2, input, + "2D or 3D (batch mode) tensor expected for input, but got: %s") + + if (numInputDims == 3) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputW = THCTensor_(size)(state, input, dimw); + int outputW = inputW + padL + padR; + + THArgCheck(outputW >= 1, 2, + "input (W: %d)is too small." + " Calculated output W: %d", + inputW, outputW); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 2) { + THCTensor_(resize2d)(state, output, numPlanes, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<3>(); + devOutput = toDeviceTensor(state, output).upcastOuter<3>(); + } else { + THCTensor_(resize3d)(state, output, numBatch, numPlanes, outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReplicationPadding_updateOutput<<>>( + devInput, devOutput, padL, padR); + +} + +void THNN_(TemporalReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR) { + + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimw = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 3) { + planeDim++; + dimw++; + } + int iwidth = input->size[dimw]; + int owidth = iwidth + padL + padR; + + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 2) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<3>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<3>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + TemporalReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, padL, padR); + +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu new file mode 100644 index 0000000..26361d4 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -0,0 +1,430 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalRowConvolution.cu" +#else + +static inline void THNN_(TemporalRowConvolution_shapeCheck)( + THCState *state, THCTensor *input, THCTensor *gradOutput, THCTensor *weight, + THCTensor *bias, int kW, int dW, int padW) { + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, "stride should be greater than zero, but got dW: %d", + dW); + THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 3), 3, + weight, "non-empty 2D or 3D weight tensor expected, but got: %s"); + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + + int ndim = input->dim(); + int dimF = 0; // feature dimension + int dimS = 1; // sequence dimension + + if (ndim == 3) { + ++dimF; + ++dimS; + } + + THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 1, input, + "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[dimS]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (nOutputFrame < 1) { + THError("Given input size: (%d x %d). " + "Calculated output size: (%d x %d). 
Output size is too small", + inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame); + } + + THCUNN_check_dim_size(state, input, ndim, dimF, inputFrameSize); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimF, inputFrameSize); + THCUNN_check_dim_size(state, gradOutput, ndim, dimS, nOutputFrame); + } +} + +void THNN_(TemporalRowConvolution_updateOutput)( + THCState *state, THCTensor *input, THCTensor *output, THCTensor *weight, + THCTensor *bias, THCTensor *finput, THCTensor *fgradInput, int kW, int dW, + int padW, bool featFirst) { + + // aliases + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + // assert same GPU + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias != NULL) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, "bias must be contiguous"); + + // reshape weight if necessary + int ndim = input->dim(); + + THCTensor *tinput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + } else { + input = THCTensor_(newContiguous)(state, input); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, NULL, weight, bias, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + } + + // Params: + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + // Batch size + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize3d)(state, output, batchSize, inputFrameSize, nOutputFrame); + + // Augment the input + THCTensor_(resize3d)(state, columns, inputFrameSize, kW, nOutputFrame); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever + // gets increased and always contains ones. + if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + // Resize plane and fill with ones... 
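+    // A 1 x nOutputFrame row of ones is enough: the bias GEMM below uses it to broadcast the bias across every output frame.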
+ THCTensor_(resize2d)(state, ones, 1, nOutputFrame); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do bias first: + // m_, n_, k_ are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = inputFrameSize; + int64_t n_ = nOutputFrame; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm asummes + // column-major matrices) + if (bias != NULL) { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( +#elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( +#endif + state, 't', 'n', n_, m_, k_, ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), THCTensor_(data)(state, output_n), + n_); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + row2col(THCState_getCurrentStream(state), THCTensor_(data)(state, input_n), + inputFrameSize, nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, columns)); + + THCTensor *output3d = THCTensor_(newWithStorage3d)( + state, output_n->storage, output_n->storageOffset, inputFrameSize, -1, + 1, -1, nOutputFrame, -1); + + // weight: inputFrameSize x 1 x kW + // columns: inputFrameSize x kW x nOutputFrame + THCTensor_(baddbmm)(state, output3d, ScalarConvert::to(1), + output3d, ScalarConvert::to(1), weight, + columns); + // output3d: inputFrameSize x 1 x nOutputFrame + + THCTensor_(free)(state, output3d); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize2d)(state, output, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + } + + if (!featFirst) { + THCTensor_(transpose)(state, output, output, ndim - 1, ndim - 2); + THCTensor_(free)(state, tinput); + } + + THCTensor_(free)(state, input); +} + +void THNN_(TemporalRowConvolution_updateGradInput)( + THCState *state, THCTensor *input, THCTensor *gradOutput, + THCTensor *gradInput, THCTensor *weight, THCTensor *finput, + THCTensor *fgradInput, int kW, int dW, int padW, bool featFirst) { + + // aliases + THCTensor *gradColumns = finput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, + gradInput); + + THArgCheck(THCTensor_(isContiguous)(state, weight), 4, "weight must be contiguous"); + + int ndim = input->dim(); + + THCTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + tgradOutput = + THCTensor_(newTranspose)(state, gradOutput, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + gradOutput = THCTensor_(newContiguous)(state, tgradOutput); + + } else { + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, weight, NULL, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], + gradOutput->size[1]); + } + + // 
Params: + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = gradOutput->size[2]; + + // Batch size + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize3d)(state, gradInput, batchSize, inputFrameSize, + nInputFrame); + + // Resize temporary columns + THCTensor_(resize3d)(state, gradColumns, inputFrameSize, kW, nOutputFrame); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + THCTensor *tweight = THCTensor_(new)(state); + THCTensor_(transpose)(state, tweight, weight, 1, 2); + + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( + state, gradOutput_n->storage, gradOutput_n->storageOffset, + inputFrameSize, -1, 1, -1, nOutputFrame, -1); + + // weight: inputFrameSize x kW x 1 + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + THCTensor_(baddbmm)(state, gradColumns, ScalarConvert::to(0), + gradColumns, ScalarConvert::to(1), tweight, + gradOutput3d); + // gradColumns: inputFrameSize x kW x nOutputFrame + + // Unpack columns back into input: + col2row(THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), inputFrameSize, + nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, gradInput_n)); + + THCTensor_(free)(state, gradOutput3d); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize2d)(state, gradOutput, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + THCTensor_(resize2d)(state, gradInput, inputFrameSize, nInputFrame); + } + + THCTensor_(free)(state, tweight); + + if (!featFirst) { + THCTensor_(transpose)(state, gradInput, gradInput, ndim - 1, ndim - 2); + THCTensor_(free)(state, tinput); + THCTensor_(free)(state, tgradOutput); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(TemporalRowConvolution_accGradParameters)( + THCState *state, THCTensor *input, THCTensor *gradOutput, + THCTensor *gradWeight, THCTensor *gradBias, THCTensor *finput, + THCTensor *fgradInput, int kW, int dW, int padW, bool featFirst, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + // Aliases + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias != NULL) { + THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); + } + + int ndim = input->dim(); + + THCTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THCTensor_(newTranspose)(state, input, ndim - 1, ndim - 2); + tgradOutput = + THCTensor_(newTranspose)(state, gradOutput, ndim - 1, ndim - 2); + input = THCTensor_(newContiguous)(state, tinput); + gradOutput = THCTensor_(newContiguous)(state, tgradOutput); + } else { + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW); + + int batch = 1; + if (ndim == 2) { + // Force batch + batch = 0; + THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], + 
gradOutput->size[1]); + } + + // Params: + int64_t inputFrameSize = gradWeight->size[0]; + int64_t nInputFrame = input->size[2]; + int64_t nOutputFrame = gradOutput->size[2]; + + // Batch size + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, 1, nOutputFrame); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // // Resize temporary columns + THCTensor_(resize3d)(state, columns, inputFrameSize, kW, nOutputFrame); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; ++elt) { + // Matrix multiply per output + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( + state, gradOutput_n->storage, gradOutput_n->storageOffset, + inputFrameSize, -1, 1, -1, nOutputFrame, -1); + + // Extract columns + row2col(THCState_getCurrentStream(state), THCTensor_(data)(state, input_n), + inputFrameSize, nInputFrame, kW, padW, dW, 1, + THCTensor_(data)(state, columns)); + + THCTensor *tcolumns = THCTensor_(new)(state); + THCTensor_(transpose)(state, tcolumns, columns, 1, 2); + + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + // columns: inputFrameSize x nOutputFrame x kW + THCTensor_(baddbmm)(state, gradWeight, ScalarConvert::to(1), + gradWeight, scale, gradOutput3d, tcolumns); + // gradWeight: inputFrameSize x 1 x kW + + THCTensor_(free)(state, tcolumns); + THCTensor_(free)(state, gradOutput3d); + + if (gradBias != NULL) { + int64_t m_ = inputFrameSize; + int64_t k_ = nOutputFrame; +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( +#endif + state, 't', k_, m_, scale, THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1); +#endif +#ifdef THC_REAL_IS_HALF // half not supported due to baddbmm + THCudaBlas_Hgemm(state, 't', 'n', m_, 1, k_, scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_); +#endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize2d)(state, gradOutput, inputFrameSize, nOutputFrame); + THCTensor_(resize2d)(state, input, inputFrameSize, nInputFrame); + } + + if (!featFirst) { + THCTensor_(free)(state, tinput); + THCTensor_(free)(state, tgradOutput); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu new file mode 100644 index 0000000..6199eef --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu @@ -0,0 +1,95 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalUpSamplingLinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, + int outputWidth) { + THArgCheck(inputWidth > 0 && 
outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 3, 2, input, + "non-empty 3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingLinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputWidth = THCTensor_(size)(state, input, 2); + THNN_(TemporalUpSamplingLinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputWidth, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize3d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputWidth > 0 && outputWidth > 0); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth, + bool align_corners) +{ + THNN_(TemporalUpSamplingLinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputWidth, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu new file mode 100644 index 0000000..55dfea2 --- /dev/null +++ b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu @@ -0,0 +1,90 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, + int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be 
greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 3, 2, input, + "3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputWidth = THCTensor_(size)(state, input, 2); + + THNN_(TemporalUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, inputWidth, outputWidth); + THAssert(inputWidth > 0 && outputWidth > 0); + + THCTensor_(resize3d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_3d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputWidth, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(TemporalUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, inputWidth, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + + const int num_kernels = outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + + nearest_neighbor_3d_kernel_backward <<>>(num_kernels, data1, data2); + + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/Threshold.cu b/aten/src/THCUNN/generic/Threshold.cu new file mode 100644 index 0000000..794ad45 --- /dev/null +++ b/aten/src/THCUNN/generic/Threshold.cu @@ -0,0 +1,70 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Threshold.cu" +#else + +#include "../common.h" + +void THNN_(Threshold_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = ScalarConvert::to(threshold_); + real val = ScalarConvert::to(val_); + THCUNN_assertSameGPU(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, + ThresholdUpdateOutputIP(threshold, val) + ); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, + ThresholdUpdateOutput(threshold, val) + ); + } + + THCudaCheck(cudaGetLastError()); +} + +void THNN_(Threshold_updateGradInput)( + THCState *state, + THCTensor *input, + 
THCTensor *gradOutput, + THCTensor *gradInput, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = ScalarConvert::to(threshold_); + real val = ScalarConvert::to(val_); + (void) val; + THCUNN_check_nElement(state, input, gradOutput); + THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput); + + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, input, + ThresholdUpdateGradInputIP(threshold) + ); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, + ThresholdUpdateGradInput(threshold) + ); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu new file mode 100644 index 0000000..d297483 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu @@ -0,0 +1,173 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAdaptiveAveragePooling.cu" +#else + +#include "../common.h" + +// 5d tensor B x D x T x H x W + +void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int osizeT, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 2, input, output); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + + real *output_data; + real *input_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + istrideD = input->stride[0]; + istrideT = input->stride[1]; + istrideH = input->stride[2]; + istrideW = input->stride[3]; + + THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeD * osizeT; + } else { + input = THCTensor_(newContiguous)(state, input); + + int64_t sizeB = input->size[0]; + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + istrideD = input->stride[1]; + istrideT = input->stride[2]; + istrideH = input->stride[3]; + istrideW = input->stride[4]; + + THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeB * sizeD * osizeT; + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
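+                /* each launch covers at most 65535 slices; the loop advances offsetZ for the remainder */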
65535 : totalZ, blocksH); + cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel + <<>>( + input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, + istrideD, istrideT, istrideH, istrideW, offsetZ + ); + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + if (input->dim() == 5) { + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + real *gradInput_data; + real *gradOutput_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + osizeT = gradOutput->size[1]; + osizeH = gradOutput->size[2]; + osizeW = gradOutput->size[3]; + } else { + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + osizeT = gradOutput->size[2]; + osizeH = gradOutput->size[3]; + osizeW = gradOutput->size[4]; + } + + // somehow nonatomic is passing all test for volumetric case. + bool atomic = false; //(isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input->dim() == 4) { + totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; + } else { + int sizeB = input->size[0]; + totalZ = atomic ? sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; + } + + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + + if (atomic) + { + cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, offsetZ + ); + } else { + cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, offsetZ + ); + } + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + // clean + THCTensor_(free)(state, gradOutput); + +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu new file mode 100644 index 0000000..7f876ae --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu @@ -0,0 +1,178 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAdaptiveMaxPooling.cu" +#else + +#include "../common.h" + +// 5d tensor B x D x T x H x W + +void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH) +{ + THCUNN_assertSameGPU(state, 3, input, output, indices); + + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THCIndex_t *indices_data; + real *output_data; + real *input_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + istrideD = input->stride[0]; + istrideT = input->stride[1]; + istrideH = input->stride[2]; + istrideW = input->stride[3]; + + THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); + THCIndexTensor_(resize4d)(state, indices, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeD * osizeT; + } else { + input = THCTensor_(newContiguous)(state, input); + + int64_t sizeB = input->size[0]; + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + istrideD = input->stride[1]; + istrideT = input->stride[2]; + istrideH = input->stride[3]; + istrideW = input->stride[4]; + + THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); + THCIndexTensor_(resize5d)(state, indices, sizeB, sizeD, osizeT, osizeH, osizeW); + + totalZ = sizeB * sizeD * osizeT; + } + + input_data = THCTensor_(data)(state, input); + output_data = THCTensor_(data)(state, output); + indices_data = THCIndexTensor_(data)(state, indices); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + cunn_VolumetricAdaptiveMaxPooling_updateOutput_kernel + <<>>( + input_data, output_data, indices_data, isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW, offsetZ + ); + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + if (input->dim() == 5) { + // clean + THCTensor_(free)(state, input); + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices) +{ + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCIndex_t *indices_data; + real *gradInput_data; + real *gradOutput_data; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input->dim() == 4) { + sizeD = input->size[0]; + isizeT = input->size[1]; + isizeH = input->size[2]; + isizeW = input->size[3]; + + osizeT = gradOutput->size[1]; + osizeH = gradOutput->size[2]; + osizeW = gradOutput->size[3]; + } else { + sizeD = input->size[1]; + isizeT = input->size[2]; + isizeH = input->size[3]; + isizeW = input->size[4]; + + osizeT = gradOutput->size[2]; + osizeH = gradOutput->size[3]; + osizeW = gradOutput->size[4]; + } + + bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input->dim() == 4) { + totalZ = sizeD * osizeT; + } else { + int sizeB = input->size[0]; + totalZ = sizeB * sizeD * osizeT; + } + + indices_data = THCIndexTensor_(data)(state, indices); + gradInput_data = THCTensor_(data)(state, gradInput); + gradOutput_data = THCTensor_(data)(state, gradOutput); + + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); + + if (atomic) + { + cunn_atomic_VolumetricAdaptiveMaxPooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, indices_data, + isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ + ); + } else { + cunn_VolumetricAdaptiveMaxPooling_updateGradInput_kernel + <<>>( + gradInput_data, gradOutput_data, indices_data, + isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ + ); + } + + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + // clean + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu new file mode 100644 index 0000000..b32643d --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu @@ -0,0 +1,383 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricAveragePooling.cu" +#else + +static inline void THNN_(VolumetricAveragePooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode) +{ + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int ndim = input->dim(); + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 4) + { + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 5) + { + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, but got size: ", input->sizes()); + } + + // The second argument is the index of padH. 
+ THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11, + "pad should not be greater than half of kernel size, but got " + "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d", + padT, padW, padH, kT, kW, kH); + + int outputTime; + int outputHeight; + int outputWidth; + + if (ceil_mode) + { + outputTime = ceil(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = ceil(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = ceil(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + else + { + outputTime = floor(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = floor(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = floor(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + if (padT || padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (gradOutput != NULL) + { + THCUNN_check_dim_size(state, gradOutput, ndim, dimN, inputSlices); + THCUNN_check_dim_size(state, gradOutput, ndim, dimt, outputTime); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (fiveDimensionalInput) + { + dimt++; + dimh++; + dimw++; + } + + THNN_(VolumetricAveragePooling_shapeCheck) + (state, input, NULL, kT, kW, kH, dT, dW, dH, + padT, padW, padH, ceil_mode); + + if (!fiveDimensionalInput) /* 4D */ + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else /* 5D */ + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + int outputTime; + int outputHeight; + int outputWidth; + + if (ceil_mode) + { + outputTime = ceil(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = ceil(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = ceil(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + else + { + outputTime = floor(float(inputTime - kT + 2*padT) / float(dT)) + 1; + outputHeight = floor(float(inputHeight - kH + 2*padH) / float(dH)) + 1; + outputWidth = floor(float(inputWidth - kW + 2*padW) / float(dW)) + 1; + } + if (padT || padH || padW) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ 
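The output extent along each pooled dimension follows the formula computed twice above (once in the shape check, once here): out = floor_or_ceil((in - k + 2*pad) / stride) + 1, trimmed so that the last pooling window still starts inside the padded input. A minimal standalone sketch of that computation; the helper name is illustrative and does not appear in the sources:

#include <cmath>

// One pooled dimension, mirroring the ceil_mode / floor computation above.
static inline int pooled_extent(int in, int k, int stride, int pad, bool ceil_mode) {
  float raw = float(in - k + 2 * pad) / float(stride);
  int out = (ceil_mode ? (int)std::ceil(raw) : (int)std::floor(raw)) + 1;
  // Ensure the last pooling window starts inside the image; with this formula
  // the trim can only trigger when padding is used in ceil mode.
  if (pad > 0 && (out - 1) * stride >= in + pad) {
    --out;
  }
  return out;
}

// Example: in = 3, k = 2, stride = 2, pad = 1 in ceil mode gives ceil(1.5) + 1 = 3,
// but the last window would start at (3 - 1) * 2 = 4 >= in + pad = 4, so the
// extent is trimmed to 2 (the same value floor mode produces here).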
+ THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + } + else /* 5D */ + { + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + output = THCTensor_(newFoldBatchDim)(state, output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + } else { + THCTensor_(retain)(state, output); + } + + THCDeviceTensor cudaInput; + THCDeviceTensor cudaOutput; + cudaInput = toDeviceTensor(state, input); + cudaOutput = toDeviceTensor(state, output); + + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + switch (kW) + { + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6); + LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7); + default: + cuda_VolumetricAveragePooling_updateOutput + <<>>( + cudaInput, + cudaOutput, + kT, kH, kW, + dT, dH, dW, + padT, padH, padW, + count_include_pad, + offsetZ); + break; + } + totalZ -= 65535; + offsetZ += 65535; + THCudaCheck(cudaGetLastError()); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); +} + +void THNN_(VolumetricAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, + bool count_include_pad) +{ + THNN_(VolumetricAveragePooling_shapeCheck) + (state, input, gradOutput, kT, kW, kH, dT, dW, dH, + padT, padW, padH, ceil_mode); + bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW); + + // Resize and initialize result tensor. 
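Two idioms recur in the launches above and below. First, CUDA caps gridDim.z at 65535, so the total number of (batch x slice x frame) planes is walked in chunks, with the running offsetZ passed to the kernel so it knows which planes the current launch covers. Second, when pooling windows can overlap (the kernelsOverlap test above, or the size-modulo test in the adaptive kernels), several output positions write into the same gradInput element, which is why the atomicAdd kernel variants are selected in those cases. A simplified but compilable CUDA sketch of the chunked launch; the kernel and driver names are hypothetical:

#include <cuda_runtime.h>

// Hypothetical kernel: each z-block handles one plane, identified by
// blockIdx.z plus the host-side chunk offset.
__global__ void process_planes(float* data, int planeSize, int totalPlanes, int offsetZ) {
  int plane = blockIdx.z + offsetZ;
  if (plane >= totalPlanes) return;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < planeSize) {
    data[plane * planeSize + i] += 1.0f;  // stand-in for the real pooling arithmetic
  }
}

// Host driver mirroring the while (totalZ > 0) loops in these files: never ask
// for more than 65535 blocks in z, and advance offsetZ between launches.
void launch_in_z_chunks(float* data, int planeSize, int totalPlanes, cudaStream_t stream) {
  int totalZ = totalPlanes;
  int offsetZ = 0;
  dim3 block(256);
  while (totalZ > 0) {
    dim3 grid((planeSize + block.x - 1) / block.x, 1, totalZ > 65535 ? 65535 : totalZ);
    process_planes<<<grid, block, 0, stream>>>(data, planeSize, totalPlanes, offsetZ);
    totalZ -= 65535;
    offsetZ += 65535;
  }
}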
+ THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + int outputTime; + int outputHeight; + int outputWidth; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + + outputTime = THCTensor_(size)(state, gradOutput, 1); + outputHeight = THCTensor_(size)(state, gradOutput, 2); + outputWidth = THCTensor_(size)(state, gradOutput, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + + outputTime = THCTensor_(size)(state, gradOutput, 2); + outputHeight = THCTensor_(size)(state, gradOutput, 3); + outputWidth = THCTensor_(size)(state, gradOutput, 4); + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + THCDeviceTensor cudaGradInput; + THCDeviceTensor cudaGradOutput; + cudaGradInput = toDeviceTensor(state, gradInput); + cudaGradOutput = toDeviceTensor(state, gradOutput); + + dim3 block(32, 8); + + // Optimizing for stride 1 is probably only of limited value, but this + // specialization yields 3x speedup over the atomicAdd implementation. + // Padding must be 0, otherwise, pool size may change. + if (dT == 1 && dH == 1 && dW == 1 && padT == 0 && padH == 0 && padW == 0) + { + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + cuda_VolumetricAveragePooling_updateGradInput_Stride1 + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + } + else + { + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + if (kernelsOverlap) + { + cuda_VolumetricAveragePooling_updateGradInput_atomicAdd + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, + padT, padH, padW, count_include_pad, offsetZ); + } + else + { + cuda_VolumetricAveragePooling_updateGradInput + <<>>( + cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, + padT, padH, padW, count_include_pad, offsetZ); + } + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + } + + THCTensor_(free)(state, gradInput); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricConvolution.cu b/aten/src/THCUNN/generic/VolumetricConvolution.cu new file mode 100644 index 0000000..e76f8cb --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricConvolution.cu @@ -0,0 +1,525 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricConvolution.cu" +#else + +static inline void THNN_(VolumetricConvolution_shapeCheck) + (THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *gradWeight, + THCTensor *bias, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(!weight || THCTensor_(isContiguous)(state, weight), 4, + "weight tensor has to be contiguous"); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + THArgCheck(!gradWeight || THCTensor_(isContiguous)(state, gradWeight), 5, + "gradWeight tensor has to be contiguous"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + + if (gradOutput != NULL) { + THCUNN_argCheck(state, !gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, + gradOutput, + "non-empty 4D or 5D (batch mode) tensor expected for gradOutput, but got: %s"); + } + + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + } + + if (gradWeight != NULL) { + THCUNN_argCheck(state, !gradWeight->is_empty() && gradWeight->dim() == 5, 4, gradWeight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for gradWeight, but got: %s"); + } + + if (weight == NULL) { + weight = gradWeight; + } + int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size[1]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + int dimd = 3; + + if (ndim == 5) + { + dimf++; + dimh++; + dimw++; + dimd++; + } + + int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size[dimh]; + int64_t inputDepth = input->size[dimd]; + + int64_t exactInputDepth = inputDepth + 2*padT; + int64_t exactInputHeight = inputHeight + 2*padH; + int64_t exactInputWidth = inputWidth + 2*padW; + + if (exactInputDepth < kT || exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated input size: (%d x %d x %d). " + "Kernel size: (%d x %d x %d). 
Kernel size can't be greater than actual input size", + exactInputDepth,exactInputHeight,exactInputWidth,kT,kH,kW); + } + + int64_t outputWidth = (exactInputDepth - kH) / dH + 1; + int64_t outputHeight = (exactInputHeight - kT) / dT + 1; + int64_t outputDepth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1) + { + THError( + "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane, inputDepth, inputHeight, inputWidth, + nOutputPlane, outputDepth, outputHeight, outputWidth + ); + } + + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + } +} + +void THNN_(VolumetricConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU(state, 6, input, output, weight, bias, columns, ones); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, NULL, weight, NULL, + bias, dT, dW, dH, padT, padW, padH); + input = THCTensor_(newContiguous)(state, input); + + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kH = (int)weight->size[3]; + int kW = (int)weight->size[4]; + + int batch = 1; + if (input->dim() == 4) + { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], + input->size[2], input->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, + outputHeight, outputWidth, outputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
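The reason a buffer that only ever contains ones is enough for the bias is that the bias GEMM inside the per-sample loop below is a rank-1 update: with k_ = 1 and beta = 0, output_n(m, n) is overwritten by bias(m) * ones(n), i.e. each channel's bias is broadcast across all outputDepth*outputHeight*outputWidth positions before the im3d2col GEMM accumulates the convolution on top of it (beta = 1). A small CPU sketch of that broadcast, using illustrative names and row-major buffers rather than the column-major BLAS call:

#include <vector>

// output is m x n (row-major), bias has length m = nOutputPlane,
// n = outputDepth * outputHeight * outputWidth.
void bias_as_rank1_gemm(const std::vector<float>& bias,
                        std::vector<float>& output,
                        int m, int n) {
  std::vector<float> ones(n, 1.0f);            // the shared "ones" buffer
  for (int row = 0; row < m; ++row) {
    for (int col = 0; col < n; ++col) {
      // alpha = 1, beta = 0: overwrite with the broadcast bias
      output[row * n + col] = bias[row] * ones[col];
    }
  }
}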
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = columns->size[1]; + int64_t k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } + THCTensor_(free)(state, input); +} + +void THNN_(VolumetricConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + + int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size[1]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + + THCTensor *gradColumns = finput; + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, + NULL, dT, dW, dH, padT, padW, padH); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int batch = 1; + if (input->dim() == 4) + { + input = THCTensor_(newContiguous)(state, input); + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im3d( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth); + THCTensor_(free)(state, input); + } + THCTensor_(free)(state, gradOutput); + +} + +void THNN_(VolumetricConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + accreal scale_) +{ + real scale = ScalarConvert::to(scale_); + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones); + THNN_(VolumetricConvolution_shapeCheck)( + state, input, gradOutput, NULL, gradWeight, + gradBias, dT, dW, dH, padT, padW, padH); + + int nOutputPlane = (int)gradWeight->size[0]; + int nInputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kH = (int)gradWeight->size[3]; + int kW = (int)gradWeight->size[4]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int batch = 1; + if (input->dim() == 4) + { + 
// Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t inputDepth = input->size[4]; + int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = gradWeight->size[0]; + int64_t n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, 
outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu new file mode 100644 index 0000000..5751ab4 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -0,0 +1,506 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedConvolution.cu" +#else + +static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + int kT, int kH, int kW, + int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int weight_nullable) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + int64_t inputDepth = input->size[dimd]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). 
Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU(state, 2, weight, bias); + } + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, NULL, weight, bias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + bias = bias ? THCTensor_(newContiguous)(state, bias) : bias; + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputHeight = input->size[3]; + int64_t inputWidth = input->size[4]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + if (bias) THCTensor_(free)(state, bias); +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + weight = THCTensor_(newContiguous)(state, weight); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + 
is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kT*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2vol( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale_) { + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 1); + + 
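What the per-sample loop below accumulates can be stated directly: vol2col unfolds input_n into a (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth) matrix of columns, the GEMM then adds scale * gradOutput_n * columns^T into gradWeight, and the GEMV against the ones buffer adds scale times the per-channel sum of gradOutput_n into gradBias (the code guards each with a null check on gradWeight / gradBias). A minimal CPU sketch of those two accumulations; the names and the row-major layout are illustrative, not the actual column-major BLAS calls:

#include <vector>

// m = nOutputPlane, q = nInputPlane*kT*kH*kW, k = outputDepth*outputHeight*outputWidth.
// gradOutput_n is m x k, columns is q x k, gradWeight is m x q (all row-major).
void acc_grad_parameters_sketch(const std::vector<float>& gradOutput_n,
                                const std::vector<float>& columns,
                                std::vector<float>& gradWeight,
                                std::vector<float>& gradBias,
                                int m, int q, int k, float scale) {
  for (int o = 0; o < m; ++o) {
    for (int c = 0; c < q; ++c) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p)
        acc += gradOutput_n[o * k + p] * columns[c * k + p];
      gradWeight[o * q + c] += scale * acc;   // the 't','n' GEMM below
    }
    float bacc = 0.f;                         // the GEMV against the ones buffer
    for (int p = 0; p < k; ++p)
      bacc += gradOutput_n[o * k + p];
    gradBias[o] += scale * bacc;
  }
}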
// Params + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kT*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + 
state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu new file mode 100644 index 0000000..b694c37 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu @@ -0,0 +1,409 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.cu" +#else + +#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ + cuda_VolumetricDilatedMaxPooling_updateOutput \ + <<>>( \ + inputData, inputTime, inputHeight, inputWidth, \ + cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\ + dilationT, dilationH, dilationW, offsetZ); \ + break + +static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) { + int ndim = input->dim(); + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + int outputTime; + int outputHeight; + int outputWidth; + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 7, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 16, + "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d", + dilationT, dilationH, dilationW); + + if (input->dim() == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + /* sizes */ + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", input->sizes()); + } + + THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 13, + "pad should be smaller than half of kernel size, but got " + "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", + kT, kW, kH, padT, padW, padH); + + if (ceilMode) + { + outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputTime = 
(int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputTime < 1 || outputHeight < 1 || outputWidth < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, inputSlices); + THCUNN_check_dim_size(state, gradOutput, ndim, dimt, outputTime); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } + if (indices != NULL) { + THCUNN_check_dim_size_indices(state, indices, ndim, dimf, inputSlices); + THCUNN_check_dim_size_indices(state, indices, ndim, dimt, outputTime); + THCUNN_check_dim_size_indices(state, indices, ndim, dimh, outputHeight); + THCUNN_check_dim_size_indices(state, indices, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + int outputTime; + int outputHeight; + int outputWidth; + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + + if (fiveDimensionalInput) + { + dimt++; + dimh++; + dimw++; + } + + THCUNN_assertSameGPU(state, 3, input, indices, output); + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, NULL, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, + dilationT, dilationW, dilationH, ceilMode); + + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (fiveDimensionalInput) + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", input->sizes()); + } + + if (ceilMode) + { + outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(floor((float)(inputWidth - 
(dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + /* indices pack ti,i,j locations for each output point as uchar into + each float of the tensor */ + THCIndexTensor_(resize4d)(state, indices, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + // Index tensor packs index offsets as uchars into floats + THCIndexTensor_(resize5d)(state, indices, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + fiveDimensionalInput = 1; + } + + input = THCTensor_(newContiguous)(state, input); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + output = THCTensor_(newFoldBatchDim)(state, output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + } else { + THCTensor_(retain)(state, output); + } + + real* inputData = THCTensor_(data)(state, input); + + THCDeviceTensor cudaOutput; + cudaOutput = toDeviceTensor(state, output); + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + int64_t indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), + indicesSize, NULL); + + THLongStorage_free(indicesSize); + + THCDeviceTensor cudaIndices = + toDeviceTensor(state, indices1); + + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + switch (kW) + { + UPDATE_OUTPUT_KERNEL_WIDTH(1); + UPDATE_OUTPUT_KERNEL_WIDTH(2); + UPDATE_OUTPUT_KERNEL_WIDTH(3); + UPDATE_OUTPUT_KERNEL_WIDTH(4); + UPDATE_OUTPUT_KERNEL_WIDTH(5); + UPDATE_OUTPUT_KERNEL_WIDTH(6); + UPDATE_OUTPUT_KERNEL_WIDTH(7); + default: + cuda_VolumetricDilatedMaxPooling_updateOutput<<>>( + inputData, inputTime, inputHeight, inputWidth, + cudaIndices, cudaOutput, + kT, kH, kW, dT, dH, dW, + padT, padH, padW, dilationT, dilationH, dilationW, offsetZ); + } + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); + THCIndexTensor_(free)(state, indices1); +} + +#undef UPDATE_OUTPUT_KERNEL_WIDTH + +void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) +{ + // TODO: gradOutput shape check + // Resize and initialize result tensor. 
+ THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + int batchSize; + int inputSlices; + + int outputTime, outputHeight, outputWidth; + int inputTime, inputHeight, inputWidth; + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, gradOutput, indices, kT, kW, kH, + dT, dW, dH, padT, padW, padH, + dilationT, dilationW, dilationH, ceilMode); + + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + + outputTime = THCTensor_(size)(state, gradOutput, 1); + outputHeight = THCTensor_(size)(state, gradOutput, 2); + outputWidth = THCTensor_(size)(state, gradOutput, 3); + inputTime = THCTensor_(size)(state, gradInput, 1); + inputHeight = THCTensor_(size)(state, gradInput, 2); + inputWidth = THCTensor_(size)(state, gradInput, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + + outputTime = THCTensor_(size)(state, gradOutput, 2); + outputHeight = THCTensor_(size)(state, gradOutput, 3); + outputWidth = THCTensor_(size)(state, gradOutput, 4); + inputTime = THCTensor_(size)(state, gradInput, 2); + inputHeight = THCTensor_(size)(state, gradInput, 3); + inputWidth = THCTensor_(size)(state, gradInput, 4); + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + THCDeviceTensor cudaGradOutput; + cudaGradOutput = toDeviceTensor(state, gradOutput); + real* gradInputData = THCTensor_(data)(state, gradInput); + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + int64_t indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), indicesSize, NULL); + THLongStorage_free(indicesSize); + + THCDeviceTensor cudaIndices = + toDeviceTensor(state, indices1); + + int64_t totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast(block.x)), + THCCeilDiv(outputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricDilatedMaxPooling_updateGradInput<<>>( + cudaGradOutput, + cudaIndices, + gradInputData, + inputTime, inputHeight, inputWidth, + dT, dH, dW, + padT, padH, padW, + dilationT, dilationH, dilationW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, gradInput); + THCTensor_(free)(state, gradOutput); + THCIndexTensor_(free)(state, indices1); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu new file mode 100644 index 0000000..f4e731f --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFractionalMaxPooling.cu @@ -0,0 +1,168 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.cu" +#else + +void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples) +{ + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int dimt = 3; + int64_t numBatch = 1; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 5) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + dimt++; + } + + /* sizes */ + int64_t numPlanes = THCTensor_(size)(state, input, planeDim); + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + int64_t inputT = THCTensor_(size)(state, input, dimt); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 7, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 < inputW, 6, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + THArgCheck(outputT + poolSizeT - 1 < inputW, 5, + "poolSizeT (%d) too large relative to input time (%d)", + poolSizeT, inputT); + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + THCDeviceTensor devIndices; + THCDeviceTensor devSamples = + toDeviceTensor(state, randomSamples); + + if (numInputDims == 4) { + /* resize output */ + THCTensor_(resize4d)(state, output, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize4d)(state, indices, numPlanes, outputH, outputW, outputT); + + devInput = toDeviceTensor(state, input).upcastOuter<5>(); + devOutput = toDeviceTensor(state, output).upcastOuter<5>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<5>(); + } else { + THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize5d)(state, indices, numBatch, numPlanes, outputH, outputW, outputT); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * devOutput.getSize(4); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devInput.getSize(1), + devInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); + +#define SFMP_UPDATE_OUTPUT(POOL_W) \ + VolumetricFractionalMaxPooling_updateOutput \ + <<>>( \ + devInput, devOutput, devIndices, devSamples, poolSizeT, poolSizeW, poolSizeH); + +#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ + case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break + + switch (poolSizeW) { + SFMP_UPDATE_OUTPUT_CASE(2); + SFMP_UPDATE_OUTPUT_CASE(3); + SFMP_UPDATE_OUTPUT_CASE(4); + SFMP_UPDATE_OUTPUT_CASE(5); + SFMP_UPDATE_OUTPUT_CASE(6); + SFMP_UPDATE_OUTPUT_CASE(7); + default: + // dynamic pool width + SFMP_UPDATE_OUTPUT_CASE(-1); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THCIndexTensor *indices) +{ + int dimh = 1; + int dimw = 2; + int dimt = 3; + + int64_t numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 5) { + dimh++; + dimw++; + dimt++; + } + + /* sizes */ + int64_t inputH = THCTensor_(size)(state, input, dimh); + int64_t inputW = THCTensor_(size)(state, input, dimw); + int64_t inputT = THCTensor_(size)(state, input, dimt); + + THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected"); + THArgCheck(outputT == THCTensor_(size)(state, gradOutput, dimt), 3, + "gradOutput time unexpected"); + + /* resize */ + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + THCDeviceTensor devIndices; + + /* backprop */ + if (numInputDims == 4) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<5>(); + devGradOutput = toDeviceTensor(state, gradOutput).upcastOuter<5>(); + devIndices = toDeviceTensor(state, indices).upcastOuter<5>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + devIndices = toDeviceTensor(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * devGradOutput.getSize(4); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devGradInput.getSize(1), + devGradInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); + + VolumetricFractionalMaxPooling_updateGradInput + <<>>( + devGradInput, devGradOutput, devIndices); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFullConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullConvolution.cu new file mode 100644 index 0000000..e2a2f55 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFullConvolution.cu @@ -0,0 +1,61 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu" +#else + +void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + THNN_(VolumetricFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); +} + + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + accreal scale_) +{ + THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH, scale_); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu new file mode 100644 index 0000000..bd653b9 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -0,0 +1,537 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFullDilatedConvolution.cu" +#else + +static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *weight, + THCTensor *bias, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, int weight_nullable) { + THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + THArgCheck((adjT < dT || adjT < dilationT) + && (adjW < dW || adjW < dilationW) + && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller 
than either stride or dilation," + " but got adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d " + "dilationT: %d dilationH: %d dilationW: %d", + adjT, adjH, adjW, dT, dH, dW, dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THCUNN_argCheck(state, !weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + if (weight != NULL) { + const int64_t nInputPlane = THCTensor_(size)(state, weight, 0); + THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); + } + + int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size[dimh]; + int64_t inputDepth = input->size[dimd]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + const int64_t nOutputPlane = THCTensor_(size)(state, weight, 1); + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + const int64_t nOutputPlane = THCTensor_(size)(state, bias, 0); + THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); + } + THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); + THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); + THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH) +{ + + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 6, input, output, weight, + bias, columns, ones); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, NULL, weight, bias, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 0); + + THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, + "bias tensor has to be contiguous"); + input = THCTensor_(newContiguous)(state, input); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputWidth = 
input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2vol( + THCState_getCurrentStream(state), + THCTensor_(data)(state, columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, output_n), n_ + ); + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, 
inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, weight); + +} + +void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH) +{ + THCTensor *gradColumns = finput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, gradOutput, weight, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 0); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + weight = THCTensor_(newContiguous)(state, weight); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + + // 
Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, weight); +} + + +void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + int adjT, int adjW, int adjH, + accreal scale_) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + real scale = ScalarConvert::to(scale_); + THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, kT, kW, kH, + dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH, + adjT, adjW, adjH, 1); + + int nOutputPlane; + if (gradWeight) { + nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THCTensor_(size)(state, gradBias, 0); + } else { + return; + } + + if (gradWeight) { + THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size[2]; + int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
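+    // The `ones` plane is consumed by the gradBias GEMV/GEMM further down:
+    // multiplying gradOutput_n by a vector of ones reduces it over its
+    // spatial extent, i.e. (conceptually)
+    //   gradBias[c] += scale * sum_{d,h,w} gradOutput_n[c][d][h][w]
+    // The buffer only needs at least outputDepth*outputHeight*outputWidth
+    // elements, which is why it is grown here instead of reallocated per call.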
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert::to(1), + THCTensor_(data)(state, gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (is_batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, input->size[1], inputDepth, inputHeight, inputWidth); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu new file mode 100644 index 0000000..8722ce9 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" +#else + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor 
*input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimension)(state, input) == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimension)(state, grid) == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t idepth = THCTensor_(size)(state, input, 2); + int64_t iheight = THCTensor_(size)(state, input, 3); + int64_t iwidth = THCTensor_(size)(state, input, 4); + int64_t odepth = THCTensor_(size)(state, grid, 1); + int64_t oheight = THCTensor_(size)(state, grid, 2); + int64_t owidth = THCTensor_(size)(state, grid, 3); + + THCUNN_check_dim_size(state, grid, 5, 0, nbatch); + THCUNN_check_dim_size(state, grid, 5, 4, 3); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); + } +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + // resize output to the same shape as input + THCTensor_(resize5d)(state, output, N, C, D, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); + THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor 
devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricMaxPooling.cu new file mode 100644 index 0000000..c86be82 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricMaxPooling.cu @@ -0,0 +1,40 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, padT, padW, padH, + 1, 1, 1, ceilMode); + +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kT, kW, kH, dT, dW, dH, padT, padW, padH, + 1, 1, 1, ceilMode); + +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu new file mode 100644 index 0000000..0b5a17d --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu @@ -0,0 +1,271 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxUnpooling.cu" +#else + +static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { + int inputSlices = 0; + + THCUNN_check_shape_indices(state, indices, input); + + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + + if (THCTensor_(nDimension)(state, input) == 4) + { + inputSlices = THCTensor_(size)(state, input, 0); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + inputSlices = THCTensor_(size)(state, input, 1); + } + else + { + AT_ERROR("non-empty 4D or 5D tensor expected, got size: ", + input->sizes()); + } + + int dimw = 3; + int dimh = 2; + int dimt = 1; + int dimn = 0; + if (input->dim() == 5) + { + dimt++; + dimw++; + dimh++; + dimn++; + } + + if (gradOutput != NULL) { + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", + oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]); + } + + THCUNN_check_dim_size(state, gradOutput, input->dim(), dimn, inputSlices); + } +} + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + int batchSize = 0; + int inputSlices = 0; + int inputTime = 0; + int inputHeight = 0; + int inputWidth = 0; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, NULL, indices, + outputTime, outputWidth, outputHeight, + dT, dW, dH, padT, padW, padH); + THCUNN_assertSameGPU(state, 3, input, indices, output); + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (fiveDimensionalInput) + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + if (!fiveDimensionalInput) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + output = THCTensor_(newContiguous)(state, output); + THCTensor_(zero)(state, output); + + if (fiveDimensionalInput) { + // Collapse batch and feature dimensions + // newFoldBatchDim assumes contiguity so the newContiguous calls must + // preceed this + THCTensor *old_output = output; + output = THCTensor_(newFoldBatchDim)(state, output); + THCTensor_(free)(state, old_output); + + THCTensor *old_input = input; + input = THCTensor_(newFoldBatchDim)(state, input); + THCTensor_(free)(state, old_input); + + THCIndexTensor *old_indices = indices; + indices = THCIndexTensor_(newFoldBatchDim)(state, indices); + THCIndexTensor_(free)(state, old_indices); + } + + real* outputData = THCTensor_(data)(state, output); + + THCDeviceTensor cudaInput; + THCDeviceTensor cudaIndices; + + cudaInput = toDeviceTensor(state, input); + cudaIndices = toDeviceTensor(state, indices); + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateOutput<<>>( + cudaInput, cudaIndices, outputData, + outputTime, outputHeight, outputWidth, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, output); + THCIndexTensor_(free)(state, indices); +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + int batchSize = 0; + int inputSlices = 0; + int inputTime = 0; + int inputHeight = 0; + int inputWidth = 0; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, gradOutput, indices, + outputTime, outputWidth, outputHeight, + dT, dW, dH, padT, padW, padH); + THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); + + int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5; + if (!fiveDimensionalInput) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + // Collapse batch and feature dimensions + if (fiveDimensionalInput) { + gradInput = THCTensor_(newFoldBatchDim)(state, gradInput); + + THCIndexTensor *old_indices = indices; + indices = THCIndexTensor_(newFoldBatchDim)(state, indices); + THCIndexTensor_(free)(state, old_indices); + + THCTensor *old_gradOutput = gradOutput; + gradOutput = THCTensor_(newFoldBatchDim)(state, gradOutput); + THCTensor_(free)(state, old_gradOutput); + } else { + THCTensor_(retain)(state, gradInput); + } + + real* gradOutputData = THCTensor_(data)(state, gradOutput); + + THCDeviceTensor cudaGradInput; + THCDeviceTensor cudaIndices; + + cudaGradInput = toDeviceTensor(state, gradInput); + cudaIndices = toDeviceTensor(state, indices); + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast(block.x)), + THCCeilDiv(inputHeight, static_cast(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateGradInput<<>>( + gradOutputData, + outputTime, outputHeight, outputWidth, + cudaIndices, + cudaGradInput, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, gradInput); + THCIndexTensor_(free)(state, indices); + THCTensor_(free)(state, input); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu new file mode 100644 index 0000000..071b322 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -0,0 +1,174 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricReplicationPadding.cu" +#else + +static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + int numInputDims = THCTensor_(nDimension)(state, input); + + THCUNN_argCheck(state, !input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + if (numInputDims == 5) { + planeDim++; + dimd++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int idepth = input->size[dimd]; + int iheight = input->size[dimh]; + int iwidth = input->size[dimw]; + int odepth = idepth + pfront + pback; + int oheight = iheight + ptop + pbottom; + int owidth = iwidth + pleft + pright; + THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2, + "input (D: %d H: %d, W: %d) is too small." + " Calculated output D: %d H: %d W: %d", + idepth, iheight, iwidth, odepth, oheight, owidth); + + if (gradOutput != NULL) { + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), + 3, "output gradient tensor must fit into 32-bit index math"); + + THArgCheck(numPlanes == THCTensor_(size)(state, gradOutput, planeDim), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + numPlanes, THCTensor_(size)(state, gradOutput, planeDim)); + THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THCTensor_(size)(state, gradOutput, dimw)); + THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THCTensor_(size)(state, gradOutput, dimh)); + THArgCheck(odepth == THCTensor_(size)(state, gradOutput, dimd), 3, + "gradOutput depth unexpected. 
Expected: %d, Got: %d", + odepth, THCTensor_(size)(state, gradOutput, dimd)); + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, ptop, + pbottom, pfront, pback); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + + if (numInputDims == 5) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimd++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputD = THCTensor_(size)(state, input, dimd); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputD = inputD + pfront + pback; + int outputH = inputH + ptop + pbottom; + int outputW = inputW + pleft + pright; + + THCDeviceTensor devInput; + THCDeviceTensor devOutput; + + if (numInputDims == 4) { + THCTensor_(resize4d)(state, output, numPlanes, outputD, outputH, outputW); + + devInput = toDeviceTensor(state, input).upcastOuter<5>(); + devOutput = toDeviceTensor(state, output).upcastOuter<5>(); + } else { + THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputD, outputH, + outputW); + + devInput = toDeviceTensor(state, input); + devOutput = toDeviceTensor(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * + devOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + VolumetricReplicationPadding_updateOutput<<>>( + devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, gradOutput, pleft, pright, ptop, + pbottom, pfront, pback); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 5) { + planeDim++; + dimd++; + dimh++; + dimw++; + } + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor devGradInput; + THCDeviceTensor devGradOutput; + + if (numInputDims == 4) { + devGradInput = toDeviceTensor(state, gradInput).upcastOuter<5>(); + devGradOutput = + toDeviceTensor(state, gradOutput).upcastOuter<5>(); + } else { + devGradInput = toDeviceTensor(state, gradInput); + devGradOutput = toDeviceTensor(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * + devGradOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); + + VolumetricReplicationPadding_updateGradInput<<>>( + devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu new file mode 100644 index 0000000..06994a1 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu @@ -0,0 +1,107 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricUpSamplingNearest.cu" +#else + +#include "../common.h" + +static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, input->_dim() == 5, 2, input, + "5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); + } +} + + +void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, input, output); + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputDepth = THCTensor_(size)(state, input, 2); + int inputHeight = THCTensor_(size)(state, input, 3); + int inputWidth = THCTensor_(size)(state, input, 4); + + THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && + outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + + THCTensor_(resize5d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputDepth, + outputHeight, + outputWidth); + THCTensor_(zero)(state, output); + + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_5d_kernel <<>>(num_kernels, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + + +void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + 
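+  // Backward sketch: under nearest-neighbour upsampling each output voxel is
+  // a copy of exactly one input voxel, so the gradient is folded back by
+  // summing, for every input cell, the gradOutput entries that were copied
+  // from it (e.g. with a 2x scale, output indices 2d and 2d+1 both map back
+  // to input index d). gradOutput is made contiguous below so the device
+  // tensor view handed to the kernel is valid, and gradInput is
+  // zero-initialised before the accumulation.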
gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); + + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + nearest_neighbor_5d_kernel_backward <<>>(num_kernels, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu new file mode 100644 index 0000000..1dbad86 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu @@ -0,0 +1,112 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.cu" +#else + +#include "../linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (THCState *state, + THCTensor *input, THCTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THCUNN_argCheck(state, !input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); + } +} + +void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + int nbatch = THCTensor_(size)(state, input, 0); + int channels = THCTensor_(size)(state, input, 1); + int inputDepth = THCTensor_(size)(state, input, 2); + int inputHeight = THCTensor_(size)(state, input, 3); + int inputWidth = THCTensor_(size)(state, input, 4); + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (state, input, NULL, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + THCUNN_assertSameGPU(state, 2, input, output); + THCTensor_(resize5d)(state, output, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + outputDepth, outputHeight, outputWidth); + THCTensor_(zero)(state, output); + THCDeviceTensor idata = toDeviceTensor(state, input); + THCDeviceTensor odata = toDeviceTensor(state, output); + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = 
linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, idata, odata); + THCudaCheck(cudaGetLastError()); +} + + +void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners) +{ + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (state, NULL, gradOutput, + nbatch, nchannels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); + THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); + THCTensor_(zero)(state, gradInput); + THCDeviceTensor data1 = toDeviceTensor(state, gradInput); + THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + const int num_kernels = outputDepth * outputHeight * outputWidth; + const int num_threads = + THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; + cudaStream_t stream = THCState_getCurrentStream(state); + caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, data1, data2); + THCudaCheck(cudaGetLastError()); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/aten/src/THCUNN/im2col.h b/aten/src/THCUNN/im2col.h new file mode 100644 index 0000000..ba90560 --- /dev/null +++ b/aten/src/THCUNN/im2col.h @@ -0,0 +1,130 @@ +#ifndef THCUNN_IM2COL_H +#define THCUNN_IM2COL_H + +#include "common.h" +#include "THCNumerics.cuh" + +// Kernel for fast unfold+copy +// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) +template +__launch_bounds__(CUDA_NUM_THREADS) +__global__ void im2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, + const int64_t ksize_h, const int64_t ksize_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, + Dtype* data_col) { + CUDA_KERNEL_LOOP(index, n) { + int64_t w_out = index % width_col; + index /= width_col; + int64_t h_out = index % height_col; + int64_t channel_in = index / height_col; + int64_t channel_out = channel_in * ksize_h * ksize_w; + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; + data_col += (channel_out * height_col + h_out) * width_col + w_out; + data_im += (channel_in * height + h_in) * width + w_in; + for (int64_t i = 0; i < ksize_h; ++i) { + for (int64_t j = 0; j < ksize_w; ++j) { + int64_t h = h_in + i * dilation_h; + int64_t w = w_in + j * dilation_w; + *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert::to(0); + data_col += height_col * width_col; + } + } + } +} + +template +void im2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t height_col, const int64_t width_col, + const int64_t ksize_h, const int64_t ksize_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int64_t num_kernels = channels * height_col * width_col; + // Launch + im2col_kernel <<>> ( + num_kernels, data_im, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, + height_col, width_col, data_col + ); + THCudaCheck(cudaGetLastError()); +} + +template +__launch_bounds__(CUDA_NUM_THREADS) +__global__ void col2im_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t channels, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, + Dtype* data_im) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int64_t w_im = index % width + pad_w; + const int64_t h_im = (index / width) % height + pad_h; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int64_t w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int64_t w_col_end = min(w_im / stride_w + 1, width_col); + const int64_t h_col_start = + (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; + const int64_t h_col_end = min(h_im / stride_h + 1, height_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_h); + int64_t w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int64_t data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + data_im[index] = ScalarConvert::to(val); + } +} + +template +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im); + +template +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im) { + int64_t num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im_kernel <<>> ( + num_kernels, data_col, height, width, channels, + patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, + output_height, output_width, data_im + ); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/linear_upsampling.h b/aten/src/THCUNN/linear_upsampling.h new file mode 100644 index 0000000..bd8a601 --- /dev/null +++ b/aten/src/THCUNN/linear_upsampling.h @@ -0,0 +1,41 @@ +#ifndef THCUNN_LINEAR_UPSAMPLING_H +#define THCUNN_LINEAR_UPSAMPLING_H + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + + +template +__host__ __forceinline__ +static Acctype linear_upsampling_compute_scale( + int inputSize, int outputSize, bool align_corners) { + if (outputSize > 1) { + return align_corners ? (Acctype) (inputSize - 1) / (outputSize - 1) + : (Acctype) inputSize / outputSize; + } else { + return Acctype(0); + } +} + +template +__device__ __forceinline__ +static Acctype linear_upsampling_compute_source_index( + Acctype scale, int dst_index, bool align_corners) { + if (align_corners) { + return scale * dst_index; + } else { + Acctype src_idx = scale * (dst_index + Acctype(0.5)) - Acctype(0.5); + return src_idx < Acctype(0) ? 
Acctype(0) : src_idx; + } +} + +__device__ __forceinline__ +static int nearest_neighbor_compute_source_index( + const float scale, int dst_index, int inputSize) { + const int src_index = MIN(floor(dst_index * scale), inputSize - 1); + return src_index; +} +#endif + diff --git a/aten/src/THCUNN/row2col.h b/aten/src/THCUNN/row2col.h new file mode 100644 index 0000000..04765dd --- /dev/null +++ b/aten/src/THCUNN/row2col.h @@ -0,0 +1,90 @@ +#ifndef THCUNN_ROW2COL_H +#define THCUNN_ROW2COL_H + +#include "THCNumerics.cuh" +#include "common.h" + +// Kernel for fast unfold+copy on rows +template +__global__ void +row2col_kernel(const int n, const Dtype *data_row, const int width, + const int ksize_w, const int pad_w, const int stride_w, + const int dilation_w, const int width_col, Dtype *data_col) { + CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + index /= width_col; + int channel_in = index; + int channel_out = channel_in * ksize_w; + int w_in = w_out * stride_w - pad_w; + data_col += (channel_out)*width_col + w_out; + data_row += (channel_in)*width + w_in; + for (int j = 0; j < ksize_w; ++j) { + int w = w_in + j * dilation_w; + *data_col = (w >= 0 && w < width) ? data_row[j * dilation_w] + : ScalarConvert::to(0); + data_col += width_col; + } + } +} + +template +void row2col(cudaStream_t stream, const Dtype *data_row, const int channels, + const int width, const int ksize_w, const int pad_w, + const int stride_w, const int dilation_w, Dtype *data_col) { + // We are going to launch channels * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * width_col; + // Launch + row2col_kernel<<>>( + num_kernels, data_row, width, ksize_w, pad_w, stride_w, 1, width_col, + data_col); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void col2row_kernel(const int n, const Dtype *data_col, + const int width, const int channels, + const int kernel_w, const int pad_w, + const int stride_w, const int dilation_w, + const int width_col, Dtype *data_row) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int w_row = index % width + pad_w; + const int c_row = index / width; + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + // compute the start and end of the output + const int w_col_start = (w_row < kernel_extent_w) + ? 0 + : (w_row - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_row / stride_w + 1, width_col); + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int w_k = (w_row - w_col * stride_w); + if (w_k % dilation_w == 0) { + w_k /= dilation_w; + int data_col_index = (c_row * kernel_w + w_k) * width_col + w_col; + val += data_col[data_col_index]; + } + } + data_row[index] = ScalarConvert::to(val); + } + } + +template +void col2row(cudaStream_t stream, const Dtype *data_col, const int channels, + const int width, const int patch_w, const int pad_w, + const int stride_w, const int dilation_w, Dtype *data_row) { + int width_col = + (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
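// [Editor's note - illustrative comment, not part of the original source; the
// sizes below are chosen only for the example.] A tiny 1-D picture of the
// gather strategy used here: assume width = 4, ksize_w = 2, stride_w = 1,
// pad_w = 0, dilation_w = 1, so width_col = 3. Column 0 covers row elements
// {x0, x1}, column 1 covers {x1, x2}, and column 2 covers {x2, x3}. Row
// element x1 therefore receives contributions from (w_col = 0, w_k = 1) and
// (w_col = 1, w_k = 0). col2row_kernel assigns one work-item to x1 and lets it
// loop over exactly those two column entries, instead of having two
// column-side threads atomicAdd into the same output address.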
+ col2row_kernel< + Dtype, Acctype><<>>( + num_kernels, data_col, width, channels, patch_w, pad_w, stride_w, + dilation_w, width_col, data_row); + + THCudaCheck(cudaGetLastError()); +} +#endif diff --git a/aten/src/THCUNN/vol2col.h b/aten/src/THCUNN/vol2col.h new file mode 100644 index 0000000..223248f --- /dev/null +++ b/aten/src/THCUNN/vol2col.h @@ -0,0 +1,139 @@ +#ifndef THCUNN_VOL2COL_H +#define THCUNN_VOL2COL_H + +#include "common.h" +#include "THCNumerics.cuh" + +// Kernel for fast unfold+copy on volumes +template +__global__ void vol2col_kernel(const int n, const Dtype* data_vol, + const int depth, const int height, const int width, + const int ksize_t, const int ksize_h, const int ksize_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + const int depth_col, const int height_col, const int width_col, + Dtype* data_col) { +CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + index /= height_col; + int t_out = index % depth_col; + int channel_in = index / depth_col; + int channel_out = channel_in * ksize_t * ksize_h * ksize_w; + int t_in = t_out * stride_t - pad_t; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + data_col += ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + int t = t_in + i * dilation_t; + int h = h_in + j * dilation_h; + int w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ? + data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : ScalarConvert::to(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col(cudaStream_t stream, const Dtype* data_vol, const int channels, + const int depth, const int height, const int width, + const int depth_col, const int height_col, const int width_col, + const int ksize_t, const int ksize_h, const int ksize_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + Dtype* data_col) { + // We are going to launch channels * depth_col * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. 
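// [Editor's note - clarifying comment, not part of the original source.]
// vol2col takes depth_col/height_col/width_col from the caller; they are
// expected to follow the usual convolution output-shape formula (the same one
// used inline for width_col in row2col.h above), e.g.:
//   depth_col  = (depth  + 2 * pad_t - (dilation_t * (ksize_t - 1) + 1)) / stride_t + 1;
//   height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
//   width_col  = (width  + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;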
+ int num_kernels = channels * depth_col * height_col * width_col; + // Launch + vol2col_kernel <<>> ( + num_kernels, data_vol, depth, height, width, ksize_t, ksize_h, ksize_w, + pad_t, pad_h, pad_w, stride_t, stride_h, stride_w, + dilation_t, dilation_h, dilation_w, + depth_col, height_col, width_col, data_col + ); + THCudaCheck(cudaGetLastError()); +} + +template +__global__ void vol2im_kernel(const int n, const Dtype* data_col, + const int depth, const int height, const int width, const int channels, + const int kernel_t, const int kernel_h, const int kernel_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + const int depth_col, const int height_col, const int width_col, + Dtype* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + Acctype val = Acctype(0); + const int w_im = index % width + pad_w; + const int h_im = (index / width) % height + pad_h; + const int t_im = (index / width / height) % depth + pad_t; + const int c_im = index / (width * height * depth); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + int kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + const int t_col_start = + (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1; + const int t_col_end = min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int t_k = (t_im - t_col * stride_t); + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = + (((((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k) + * depth_col + t_col) * height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = ScalarConvert::to(val); + } +} + +template +void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels, + const int depth, const int height, const int width, + const int output_depth, const int output_height, const int output_width, + const int patch_t, const int patch_h, const int patch_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + const int dilation_t, const int dilation_h, const int dilation_w, + Dtype* data_vol) { + int num_kernels = channels * depth * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
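// [Editor's note - illustrative comment, not part of the original source.]
// This is the same gather-style reduction as col2im/col2row: num_kernels is
// channels * depth * height * width, i.e. one work-item per *input* voxel,
// and each work-item walks the (t_col, h_col, w_col) column windows that
// overlap it and sums their entries. A scatter-style kernel over column
// entries would instead need atomicAdd on data_vol, making the summation
// order (and hence the floating-point result) non-deterministic.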
+ vol2im_kernel <<>> ( + num_kernels, data_col, depth, height, width, channels, + patch_t, patch_h, patch_w, pad_t, pad_h, pad_w, stride_t, stride_h, stride_w, + dilation_t, dilation_h, dilation_w, + output_depth, output_height, output_width, data_vol + ); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THNN/CMakeLists.txt b/aten/src/THNN/CMakeLists.txt new file mode 100644 index 0000000..e61624c --- /dev/null +++ b/aten/src/THNN/CMakeLists.txt @@ -0,0 +1,5 @@ +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp +PARENT_SCOPE) +INSTALL(FILES THNN.h Reduction.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN") +INSTALL(FILES generic/THNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN/generic") diff --git a/aten/src/THNN/README.md b/aten/src/THNN/README.md new file mode 100644 index 0000000..da4d549 --- /dev/null +++ b/aten/src/THNN/README.md @@ -0,0 +1,27 @@ +# THNN + +THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions; most users will want to use ATen, which provides a C++ wrapper around these functions. + +There is also a CUDA counterpart of THNN, THCUNN. + +Looking to add an implementation? Consider writing an ATen native function +instead! See [../ATen/native](ATen/native). + +## Links + +* [API reference](doc/api_reference.md) +* [Style guidelines](doc/style_guidelines.md) + +## API + +THNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations: + +* **updateOutput** - applies the module to an input +* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input +* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters + +For information on argument types please check the [API reference](doc/api_reference.md). + +## Developer docs + +* [Style guidelines](doc/style_guidelines.md) diff --git a/aten/src/THNN/Reduction.h b/aten/src/THNN/Reduction.h new file mode 100644 index 0000000..fea4c2f --- /dev/null +++ b/aten/src/THNN/Reduction.h @@ -0,0 +1,17 @@ +#ifndef REDUCE_H +#define REDUCE_H + +namespace Reduction { + +// NB: Keep this in sync with Reduction class in torch/nn/modules/functional.py +// These constants control the reduction behavior of loss functions. 
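// [Editor's illustration, derived from the per-value comments below.] Given
// per-element losses l_1..l_N, Reduction::None returns them unreduced,
// Reduction::ElementwiseMean returns (1/N) * (l_1 + ... + l_N), and
// Reduction::Sum returns l_1 + ... + l_N. (Criteria with per-element weights,
// e.g. ClassNLLCriterion.c below, divide by the total weight instead of N.)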
+// Ideally, this would be a scoped enum, but jit doesn't support that +enum Reduction { + None, // Do not reduce + ElementwiseMean, // Sum losses and take mean over each individually computed loss element + Sum, // Sum losses + END +}; +} + +#endif diff --git a/aten/src/THNN/THNN.h b/aten/src/THNN/THNN.h new file mode 100644 index 0000000..e216e62 --- /dev/null +++ b/aten/src/THNN/THNN.h @@ -0,0 +1,33 @@ +#ifndef THNN_H +#define THNN_H + +#include +#include +#ifdef _OPENMP +#include +#endif + +#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME) + +#define THIndexTensor THLongTensor +#define THIndexTensor_(NAME) THLongTensor_ ## NAME + +#define THIntegerTensor THIntTensor +#define THIntegerTensor_(NAME) THIntTensor_ ## NAME + +typedef int64_t THIndex_t; +typedef int32_t THInteger_t; +typedef void THNNState; + +#define THNN_resizeAs_indices(I1, I2) \ + THLongStorage *size2 = THIndexTensor_(newSizeOf)(I2); \ + if (!THTensor_(isSize)(I1, size2)) \ + { \ + THTensor_(resize)(I1, size2, NULL); \ + } \ + THLongStorage_free(size2); + +#include "generic/THNN.h" +#include + +#endif diff --git a/aten/src/THNN/doc/api_reference.md b/aten/src/THNN/doc/api_reference.md new file mode 100644 index 0000000..2372bba --- /dev/null +++ b/aten/src/THNN/doc/api_reference.md @@ -0,0 +1,27 @@ +# API docs + +This document describes the conventions behind the THNN API. + +### The API + +All functions provided by THNN are stored in `aten/src/THNN/generic/THNN.h`. +Look at this file. + +### Note on function names + +Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions: + +* `void THNN_FloatAbs_updateOutput(...)` +* `void THNN_DoubleAbs_updateOutput(...)` + +In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. + +### Argument types + +Some arguments have additional tags placed in square brackets in their header declarations: + +* **[OUT]** - This is the output argument. It will be reshaped if needed. +* **[OPTIONAL]** - This argument is optional and can be safely set to NULL +* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call. +* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output). + diff --git a/aten/src/THNN/doc/style_guidelines.md b/aten/src/THNN/doc/style_guidelines.md new file mode 100644 index 0000000..a725454 --- /dev/null +++ b/aten/src/THNN/doc/style_guidelines.md @@ -0,0 +1,59 @@ +## API design guidelines + +Functions should return `void`. + +All functions should accept arguments in the following order. `...` represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this: +``` +[weight], [bias], [any buffers], [additional arguments], [optional arguments] +``` + +### Modules +``` +updateOutput: state, input, output, ... +updateGradInput: state, input, gradOutput, gradInput, ... +accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... +``` + +e.g. +```C +void THNN_(HardShrink_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) +``` + +### Criterions +``` +updateOutput: state, input, target, output, ... 
+updateGradInput: state, input, target, gradInput, ... +``` + +e.g. + +```C +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState* state, + THTensor *input, + THLongTensor *target, + THTensor *output, + THTensor *weights, + THTensor *total_weight, + bool sizeAverage) +``` + +## Code style guide + +```C +void THNN_Linear_updateOutput( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +//<- 10 -> +``` + +All arguments should start on a new line after function name, and they should be indented using 10 spaces. + +Use 2 spaces for block indentation. diff --git a/aten/src/THNN/generic/Abs.c b/aten/src/THNN/generic/Abs.c new file mode 100644 index 0000000..28721ec --- /dev/null +++ b/aten/src/THNN/generic/Abs.c @@ -0,0 +1,28 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Abs.c" +#else + +void THNN_(Abs_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + THTensor_(abs)(output, input); +} + +void THNN_(Abs_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + real z = *input_data; + *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1); + ); +} + +#endif diff --git a/aten/src/THNN/generic/AbsCriterion.c b/aten/src/THNN/generic/AbsCriterion.c new file mode 100644 index 0000000..73552a2 --- /dev/null +++ b/aten/src/THNN/generic/AbsCriterion.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/AbsCriterion.c" +#else + +void THNN_(AbsCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = fabs(*input_data - *target_data); + ); + return; + } + + real sum = 0; + THTensor_(resize1d)(output, 1); + TH_TENSOR_APPLY2(real, input, real, target, + sum += fabs(*input_data - *target_data); + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = ((*input_data - *target_data) >= 0 ? 1 : -1); + ); + TH_TENSOR_APPLY2(real, gradInput, real, gradOutput, + *gradInput_data *= *gradOutput_data; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.) * THTensor_(fastGet1d)(gradOutput, 0); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data - *target_data) >= 0 ? 
norm : -norm; + ); +} + +#endif diff --git a/aten/src/THNN/generic/BCECriterion.c b/aten/src/THNN/generic/BCECriterion.c new file mode 100644 index 0000000..f3f74ca --- /dev/null +++ b/aten/src/THNN/generic/BCECriterion.c @@ -0,0 +1,118 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BCECriterion.c" +#else + +#define EPS 1e-12 + +static inline real safe_log(real a) { + if (a == 0.) { + return log(EPS); + } + return log(a); +} + +void THNN_(BCECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + real x = *input_data; + real y = *target_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + *output_data = -(safe_log(x) * y + safe_log(1. - x) * (1. - y)); + ); + if (weights) { + THTensor_(cmul)(output, output, weights); + } + return; + } + + THTensor_(resize1d)(output, 1); + real sum = 0; + + if (weights) { + TH_TENSOR_APPLY3(real, input, real, target, real, weights, + real x = *input_data; + real y = *target_data; + real w = *weights_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= (safe_log(x) * y + safe_log(1. - x) * (1. - y)) * w; + ); + } else { + TH_TENSOR_APPLY2(real, input, real, target, + real x = *input_data; + real y = *target_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= safe_log(x) * y + safe_log(1. - x) * (1. - y); + ); + } + + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(BCECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_NELEMENT(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data; + real y = *target_data; + *gradInput_data = -(y - x) / ((1. - x + EPS) * (x + EPS)); + ); + + if (weights) { + TH_TENSOR_APPLY3(real, gradInput, real, weights, real, gradOutput, + *gradInput_data = *gradInput_data * *weights_data * *gradOutput_data; + ); + } else { + THTensor_(cmul)(gradInput, gradInput, gradOutput); + } + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data; + real y = *target_data; + *gradInput_data = - norm * (y - x) / ((1. 
- x + EPS) * (x + EPS)) * THTensor_(fastGet1d)(gradOutput, 0); + ); + + if(weights) + THTensor_(cmul)(gradInput, gradInput, weights); +} + +#undef EPS + +#endif diff --git a/aten/src/THNN/generic/BatchNormalization.c b/aten/src/THNN/generic/BatchNormalization.c new file mode 100644 index 0000000..1f2aa3c --- /dev/null +++ b/aten/src/THNN/generic/BatchNormalization.c @@ -0,0 +1,160 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BatchNormalization.c" +#else + +void THNN_(BatchNormalization_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double momentum, double eps) +{ + THTensor_(resizeAs)(output, input); + int64_t nInput = THTensor_(size)(input, 1); + int64_t f; + ptrdiff_t n = THTensor_(nElement)(input) / nInput; + + if (train) { + THTensor_(resize1d)(save_mean, nInput); + THTensor_(resize1d)(save_std, nInput); + } + + #pragma omp parallel for + for (f = 0; f < nInput; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *out = THTensor_(newSelect)(output, 1, f); + + real mean, invstd; + + if (train) { + // compute mean per input + accreal sum = 0; + TH_TENSOR_APPLY(real, in, sum += *in_data;); + + mean = (real) sum / n; + THTensor_(set1d)(save_mean, f, (real) mean); + + // compute variance per input + sum = 0; + TH_TENSOR_APPLY(real, in, + sum += (*in_data - mean) * (*in_data - mean);); + + if (sum == 0 && eps == 0.0) { + invstd = 0; + } else { + invstd = (real) (1 / sqrt(sum/n + eps)); + } + THTensor_(set1d)(save_std, f, (real) invstd); + + // update running averages + if (running_mean) { + THTensor_(set1d)(running_mean, f, + (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f))); + } + if (running_var) { + accreal unbiased_var = sum / (n - 1); + THTensor_(set1d)(running_var, f, + (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f))); + } + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // compute output + real w = weight ? THTensor_(get1d)(weight, f) : 1; + real b = bias ? THTensor_(get1d)(bias, f) : 0; + + TH_TENSOR_APPLY2(real, in, real, out, + *out_data = (real) (((*in_data - mean) * invstd) * w + b);); + + THTensor_(free)(out); + THTensor_(free)(in); + } +} + +void THNN_(BatchNormalization_backward)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, + THTensor *gradWeight, THTensor *gradBias, THTensor *weight, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double scale, double eps) +{ + THNN_CHECK_SHAPE(input, gradOutput); + int64_t nInput = THTensor_(size)(input, 1); + int64_t f; + ptrdiff_t n = THTensor_(nElement)(input) / nInput; + + if (gradInput) { + THTensor_(resizeAs)(gradInput, input); + } + + #pragma omp parallel for + for (f = 0; f < nInput; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); + real w = weight ? 
THTensor_(get1d)(weight, f) : 1; + real mean, invstd; + if (train) { + mean = THTensor_(get1d)(save_mean, f); + invstd = THTensor_(get1d)(save_std, f); + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // sum over all gradOutput in feature plane + accreal sum = 0; + TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;); + + // dot product of the Q(X) and gradOuput + accreal dotp = 0; + TH_TENSOR_APPLY2(real, in, real, gradOut, + dotp += (*in_data - mean) * (*gradOut_data);); + + if (gradInput) { + THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f); + + if (train) { + // when in training mode + // Q(X) = X - E[x] ; i.e. input centered to zero mean + // Y = Q(X) / σ ; i.e. BN output before weight and bias + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w + + // projection of gradOutput on to output scaled by std + real k = (real) dotp * invstd * invstd / n; + TH_TENSOR_APPLY2(real, gradIn, real, in, + *gradIn_data = (*in_data - mean) * k;); + + accreal gradMean = sum / n; + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); + + } else { + // when in evaluation mode + // Q(X) = X - running_mean ; i.e. input centered to zero mean + // Y = Q(X) / running_std ; i.e. BN output before weight and bias + // dL/dX = w / running_std + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = *gradOut_data * invstd * w;); + } + + THTensor_(free)(gradIn); + } + + if (gradWeight) { + real val = THTensor_(get1d)(gradWeight, f); + THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd); + } + + if (gradBias) { + real val = THTensor_(get1d)(gradBias, f); + THTensor_(set1d)(gradBias, f, val + scale * sum); + } + + THTensor_(free)(gradOut); + THTensor_(free)(in); + } +} + +#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c new file mode 100644 index 0000000..a434efa --- /dev/null +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -0,0 +1,219 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + THTensor_(resize1d)(total_weight, 1); + int n_dims = THTensor_(_nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (THIndexTensor_(_nDimension)(target) > 1) { + THError("multi-target not supported"); + } + if (THTensor_(_nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + if (weights && THTensor_(nElement)(weights) != n_classes) { + THDescBuff s1 = THTensor_(sizeDesc)(weights); + THError("weight tensor should be defined either for all %d classes or no classes" + " but got weight tensor of shape: %s", n_classes, s1.str); + } + + if (reduction == Reduction::None && n_dims == 2) { + int batch_size = THTensor_(size)(input, 0); + THTensor_(resize1d)(output, batch_size); + + std::atomic invalid_target(-1); // We cannot throw an exception inside omp parallel + int i; + #pragma omp parallel for private(i) + for (i = 0; i < batch_size; i++) { + int cur_target = THLongTensor_fastGet1d(target, i) - TH_INDEX_BASE; + + if (cur_target >= 0 && cur_target < n_classes) { + if (cur_target == ignore_index) { + THTensor_(fastSet1d)(output, i, 0.0f); + continue; + } + real 
cur_weight = weights ? THTensor_(fastGet1d)(weights, cur_target) : 1.0f; + THTensor_(fastSet1d)(output, i, -THTensor_(fastGet2d)(input, i, cur_target) * cur_weight); + } else { + int tmp = -1; + invalid_target.compare_exchange_strong(tmp, cur_target); + } + } + + if (invalid_target.load() >= 0) { + THError("Target %d out of bounds", invalid_target.load()); + } + + return; + } + + THTensor_(resize1d)(output, 1); + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + output_data[0] = total_weight_data[0] = 0.0; + + if (THTensor_(_nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; + output_data[0] = -input_data[cur_target] * total_weight_data[0]; + } + } else if (THTensor_(_nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++) { + int cur_target = target_data[i] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? weights_data[cur_target] : 1.0f; + total_weight_data[0] += cur_weight; + output_data[0] -= input_data[i * n_target + cur_target] * cur_weight; + } + } + } + + if (reduction == Reduction::ElementwiseMean && total_weight_data[0]) { + output_data[0] /= total_weight_data[0]; + } + + if (weights) { + THTensor_(free)(weights); + } + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + int n_dims = THTensor_(_nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (!THTensor_(isContiguous)(gradInput)) { + THError("gradInput must be contiguous"); + } + + if (THIndexTensor_(_nDimension)(target) > 1) { + THError("multi-target not supported"); + } + + if (THTensor_(_nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + + if (weights && THTensor_(nElement)(weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + if (reduction == Reduction::None && n_dims == 2) { + int batch_size = THTensor_(size)(input, 0); + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, batch_size); + + int i; + #pragma omp parallel for private(i) + for (i = 0; i < batch_size; i++) { + int cur_target = THLongTensor_fastGet1d(target, i) - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + real weight = weights ? 
THTensor_(fastGet1d)(weights, cur_target) : 1.0f; + THTensor_(fastSet2d)(gradInput, i, cur_target, -weight * THTensor_(fastGet1d)(gradOutput, i)); + } + return; + } + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) { + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + real gradOutput_value = THTensor_(get1d)(gradOutput, 0); + + if (THTensor_(_nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[cur_target] = + (reduction != Reduction::ElementwiseMean && weights) ? -weights_data[cur_target] : -1; + gradInput_data[cur_target] *= gradOutput_value; + } + + } else if (THTensor_(_nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++){ + int cur_target = target_data[i] - TH_INDEX_BASE; + + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[i * n_target + cur_target] = + -(weights ? weights_data[cur_target] : 1.0f) * gradOutput_value; + + if (reduction == Reduction::ElementwiseMean && *total_weight_data) { + gradInput_data[i * n_target + cur_target] /= *total_weight_data; + } + } + } + } + + THIndexTensor_(free)(target); + if (weights) { + THTensor_(free)(weights); + } +} + +#endif diff --git a/aten/src/THNN/generic/Col2Im.c b/aten/src/THNN/generic/Col2Im.c new file mode 100644 index 0000000..aa5174d --- /dev/null +++ b/aten/src/THNN/generic/Col2Im.c @@ -0,0 +1,232 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Col2Im.c" +#else + +// Note [im2col/col2im output padding] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Our implementations of im2col and col2im take both the input height/width as +// well as a seemingly redundant output height/width. In principle, you could +// compute the output height/width by using the convolution shape formulas. So, +// what's up with that? +// +// The trouble arises when one runs the backward of a transposed convolution +// with output_padding >= stride. (BTW, output_padding is known as adj inside +// THNN.) Let's consider a simple case where we have kernel=2, dilation=2, +// stride=1, output_padding=1 for a 4x4 input: +// +// Input: X +// +// Output: X.X. +// .... +// X.X. +// .... +// +// If we compute backwards of output with a standard convolution on the output +// with the same parameters, we would end up with a 2x2 grad_input (because you +// can slide the stencil over to the right once and down once). But that is all +// out-of-bounds if you're computing backwards for a 1x1 input. +// +// "Now Edward," you might say, "the real problem is that you set output_padding +// >= stride, surely an error should have been raised in this case." To +// understand why it is useful to handle this case, we have to understand how we +// compute the weight gradient of a convolution. Suppose we have a convolution +// with kernel=2, stride=2 on a 5x5 input. Let us see all the contributions of +// weight[0][0] (which we have labeled w) in the output: +// +// Input: a.b.. 
Weight: w. +// ..... .. +// c.d.. +// ..... +// ..... +// +// Output: [ aw+... bw+... ] +// [ cw+... dw+... ] +// +// From this diagram, it easy to see that we can compute the weight gradient +// by performing a *dilated* convolution between the input and the +// output gradients with kernel=2, dilation=2, stride=1. But there's a rub: if +// we do a dilated convolution directly, we'll end up with a 3x3 weight +// gradient, when we clearly wanted a 2x2. So how do we avoid going out +// of bounds? We could add a notion of 'output_padding' for non-transposed +// convolution, but another simple and effective fix is to just accept +// the desired output size directly, and compute only within those bounds. +// +// +// ALSO do vol2col + +static void THNN_(im2col)(const real* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + real* data_col) { + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
+ data_im[(c_im * height + h_im) * width + w_im] : 0; + } + } + } +} + +static void THNN_(col2im)(const real* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + real* data_im) { + memset(data_im, 0, sizeof(real) * height * width * channels); + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } +} + +static inline void THNN_(Col2Im_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + THArgCheck(kW > 0 && kH > 0, 6, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(sW > 0 && sH > 0, 12, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + THArgCheck(dW > 0 && dH > 0, 8, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int64_t ndim = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input, + "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); + + int64_t batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size[batch_dim + 1]; + + if (nInputPlane % (kW * kH) != 0) { + THError("Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=%lld and " + "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); + } + + int64_t inputLength = input->size[batch_dim + 2]; + int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; + int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; + + if (inputLength != (nBlocksH * nBlocksW)) { + THError("Given output_size=(%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected " + "size of input's dimension 2 to match the calculated number of " + "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.", + outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW, + (long long) nBlocksH, (long long) nBlocksW, + (long long) (nBlocksH * nBlocksW), (long long) inputLength); + } + + if (outputWidth < 1 || outputHeight < 1) { + THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).", + outputHeight, outputWidth); + } +} + +void THNN_(Col2Im_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, sH, sW); + + bool batched_input = true; + if (input->dim() == 2) { + // Force batch + batched_input = false; + THTensor_(resize3d)(input, 1, input->size[0], input->size[1]); + } + + long batchSize = input->size[0]; + long nInputPlane = input->size[1]; + long nOutputPlane = nInputPlane / (kW * kH); + + input = THTensor_(newContiguous)(input); + + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + THTensor_(zero)(output); + + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + + for (int64_t elt = 0; elt < batchSize; elt++) { + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + THNN_(col2im)( + THTensor_(data)(input_n), + nOutputPlane, + outputHeight, outputWidth, + height_col, width_col, + kH, kW, + padH, padW, + sH, sW, + dH, dW, THTensor_(data)(output_n)); + } + + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + if (!batched_input) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + } + THTensor_(free)(input); +} + +void THNN_(Col2Im_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, + kH, kW, dH, dW, padH, padW, sH, sW); +} + +#endif diff --git a/aten/src/THNN/generic/DistKLDivCriterion.c b/aten/src/THNN/generic/DistKLDivCriterion.c new file mode 100644 index 0000000..8233608 --- /dev/null +++ b/aten/src/THNN/generic/DistKLDivCriterion.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + 
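  // [Editor's note, inferred from the element-wise formula below, not part of
  // the original source.] The loss term computed here is
  //   target_i * (log(target_i) - input_i)   for target_i > 0, and 0 otherwise,
  // i.e. `input` is expected to already hold log-probabilities while `target`
  // holds plain probabilities.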
THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0; + ); + return; + } + + THTensor_(resize1d)(output, 1); + + real sum = 0; + + TH_TENSOR_APPLY2(real, input, real, target, + sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0; + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(input, gradOutput); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, target, + *gradInput_data = *target_data > 0 ? (-*target_data) * *gradOutput_data : 0; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = *target_data > 0 ? norm * (-*target_data) * THTensor_(fastGet1d)(gradOutput, 0) : 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c new file mode 100644 index 0000000..f2d8718 --- /dev/null +++ b/aten/src/THNN/generic/ELU.c @@ -0,0 +1,45 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ELU.c" +#else + +void THNN_(ELU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal alpha_, + accreal scale, + bool inplace) +{ + real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + if (inplace) { + TH_TENSOR_APPLY(real, input, + *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + ); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + ); + } +} + +void THNN_(ELU_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal alpha_, + accreal scale) +{ + real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + THNN_CHECK_NELEMENT(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = *output_data <= 0 ? 
*gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; + ); +} + +#endif diff --git a/aten/src/THNN/generic/FeatureLPPooling.c b/aten/src/THNN/generic/FeatureLPPooling.c new file mode 100644 index 0000000..fdb4bbe --- /dev/null +++ b/aten/src/THNN/generic/FeatureLPPooling.c @@ -0,0 +1,360 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/FeatureLPPooling.c" +#else + +#ifndef FEATURE_LP_DEFS +#define FEATURE_LP_DEFS + +#ifdef _MSC_VER + #define FEATURE_LP_SIZE_TYPE int64_t + #define FEATURE_LP_CAST_TYPE (int64_t) +#else + #define FEATURE_LP_SIZE_TYPE size_t + #define FEATURE_LP_CAST_TYPE +#endif + +typedef struct { + size_t size[4]; + size_t stride[4]; +} FeatureLPPoolingSizes; + +static inline size_t flpGetOffset(FeatureLPPoolingSizes* s, + FEATURE_LP_SIZE_TYPE batch, + FEATURE_LP_SIZE_TYPE feature, + FEATURE_LP_SIZE_TYPE opt1, + FEATURE_LP_SIZE_TYPE opt2) { + return s->stride[0] * batch + + s->stride[1] * feature + + s->stride[2] * opt1 + + s->stride[3] * opt2; +} + +static inline size_t flpOutputSize(FEATURE_LP_SIZE_TYPE inputSize, + FEATURE_LP_SIZE_TYPE width, + FEATURE_LP_SIZE_TYPE stride) { + return ((inputSize - width) / stride) + 1; +} + +#endif // FEATURE_LP_DEFS + +FeatureLPPoolingSizes +THNN_(FeatureLPPooling_upcastCPU)(THTensor* t, bool batchMode) { + int dim = THTensor_(_nDimension)(t); + + // Upcast to [batch dim][feature dim][opt dim 1][opt dim 2] + FeatureLPPoolingSizes s; + for (int i = 0; i < 4; ++i) { + s.size[i] = 1; + s.stride[i] = 1; + } + + if (dim == 1) { + THAssert(!batchMode); + // [feature dim] + s.size[1] = THTensor_(size)(t, 0); + s.stride[1] = THTensor_(stride)(t, 0); + } else if (dim == 2) { + if (batchMode) { + // [batch dim][feature dim] + for (int i = 0; i < 2; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } else { + // [feature dim][opt dim 1] + s.size[1] = THTensor_(size)(t, 0); + s.stride[1] = THTensor_(stride)(t, 0); + s.size[2] = THTensor_(size)(t, 1); + s.stride[2] = THTensor_(stride)(t, 1); + } + } else if (dim == 3) { + if (batchMode) { + // [batch dim][feature dim][opt dim 1] + for (int i = 0; i < 3; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } else { + // [feature dim][opt dim 1][opt dim 2] + for (int i = 1; i < 4; ++i) { + s.size[i] = THTensor_(size)(t, i - 1); + s.stride[i] = THTensor_(stride)(t, i - 1); + } + } + } else if (dim == 4) { + // [batch dim][feature dim][opt dim 1][opt dim 2] + THAssert(batchMode); + for (int i = 0; i < 4; ++i) { + s.size[i] = THTensor_(size)(t, i); + s.stride[i] = THTensor_(stride)(t, i); + } + } + + return s; +} + +void +THNN_(FeatureLPPooling_resizeForOutputCPU)(THTensor* toResize, + THTensor* input, + bool batchMode, + int width, + int stride) { + int inputDim = THTensor_(_nDimension)(input); + THAssert(inputDim >= 1 && inputDim <= 4); + + int64_t outSize = + flpOutputSize(THTensor_(size)(input, 0), width, stride); + if (batchMode) { + THAssert(inputDim > 1); + outSize = + flpOutputSize(THTensor_(size)(input, 1), width, stride); + } else { + THAssert(inputDim < 4); + } + + if (inputDim == 1) { + THTensor_(resize1d)(toResize, outSize); + } else if (inputDim == 2) { + if (batchMode) { + THTensor_(resize2d)(toResize, + THTensor_(size)(input, 0), + outSize); + } else { + THTensor_(resize2d)(toResize, + outSize, + THTensor_(size)(input, 1)); + } + } else if (inputDim == 3) { + if (batchMode) { + THTensor_(resize3d)(toResize, + THTensor_(size)(input, 0), outSize, + THTensor_(size)(input, 2)); + } 
else { + THTensor_(resize3d)(toResize, + outSize, THTensor_(size)(input, 1), + THTensor_(size)(input, 2)); + } + } else if (inputDim == 4) { + THTensor_(resize4d)(toResize, + THTensor_(size)(input, 0), + outSize, + THTensor_(size)(input, 2), + THTensor_(size)(input, 3)); + } +} + +// Makes `toResize` the same size/dimensionality as `src` +void +THNN_(FeatureLPPooling_resizeCPU)(THTensor* toResize, + THTensor* src) { + int inputDim = THTensor_(_nDimension)(src); + THAssert(inputDim >= 1 && inputDim <= 4); + + if (inputDim == 1) { + THTensor_(resize1d)(toResize, + THTensor_(size)(src, 0)); + } else if (inputDim == 2) { + THTensor_(resize2d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1)); + } else if (inputDim == 3) { + THTensor_(resize3d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1), + THTensor_(size)(src, 2)); + } else if (inputDim == 4) { + THTensor_(resize4d)( + toResize, + THTensor_(size)(src, 0), + THTensor_(size)(src, 1), + THTensor_(size)(src, 2), + THTensor_(size)(src, 3)); + } +} + +void +THNN_(FeatureLPPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal power, + int width, + int stride, + bool batchMode) { + int inputDim = THTensor_(_nDimension)(input); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 2, + "input must be 2-4 dimensions for batch mode"); + } else { + THArgCheck(inputDim >= 1 && inputDim <= 3, 2, + "input must be 1-3 dimensions for non-batch mode"); + } + + FeatureLPPoolingSizes inputDesc = + THNN_(FeatureLPPooling_upcastCPU)(input, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(inputDesc.size[1] >= (FEATURE_LP_SIZE_TYPE) width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 5, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 6, + "stride must be between 1 - 4"); + + // Resize output + + THNN_(FeatureLPPooling_resizeForOutputCPU)( + output, input, batchMode, width, stride); + + FeatureLPPoolingSizes outputDesc = + THNN_(FeatureLPPooling_upcastCPU)(output, batchMode); + + real* inputP = THTensor_(data)(input); + real* outputP = THTensor_(data)(output); + + FEATURE_LP_SIZE_TYPE batch, opt1, opt2, outputFeature, i; + +#pragma omp parallel for + for (batch = 0; batch < FEATURE_LP_CAST_TYPE inputDesc.size[0]; ++batch) { + for (opt1 = 0; opt1 < FEATURE_LP_CAST_TYPE inputDesc.size[2]; ++opt1) { + for (opt2 = 0; opt2 < FEATURE_LP_CAST_TYPE inputDesc.size[3]; ++opt2) { + for (outputFeature = 0; + outputFeature < FEATURE_LP_CAST_TYPE outputDesc.size[1]; ++outputFeature) { + + accreal v = (accreal) 0; + for (i = 0; i < (FEATURE_LP_SIZE_TYPE) width; ++i) { + FEATURE_LP_SIZE_TYPE inputFeature = outputFeature * stride + i; + if (inputFeature >= FEATURE_LP_CAST_TYPE inputDesc.size[1]) { + break; + } + + v += + pow(inputP[flpGetOffset(&inputDesc, + batch, + inputFeature, + opt1, + opt2)], power); + } + + outputP[flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)] = + pow(v, (accreal) 1 / power); + } + } + } + } +} + +void +THNN_(FeatureLPPooling_updateGradInput)( + THNNState *state, + THTensor* gradOutput, + THTensor* input, + THTensor* output, + THTensor* gradInput, + accreal power, + int width, + int stride, + bool batchMode) { + int inputDim = THTensor_(_nDimension)(input); + + if (batchMode) { + THArgCheck(inputDim >= 2 && inputDim <= 4, 3, + "input must be 2-4 dimensions for batch mode"); + } else { + 
THArgCheck(inputDim >= 1 && inputDim <= 3, 3, + "input must be 1-3 dimensions for non-batch mode"); + } + + FeatureLPPoolingSizes inputDesc = + THNN_(FeatureLPPooling_upcastCPU)(input, batchMode); + FeatureLPPoolingSizes gradOutputDesc = + THNN_(FeatureLPPooling_upcastCPU)(gradOutput, batchMode); + FeatureLPPoolingSizes outputDesc = + THNN_(FeatureLPPooling_upcastCPU)(output, batchMode); + + // Make sure the feature dimension is properly sized + THArgCheck(inputDesc.size[1] >= (FEATURE_LP_SIZE_TYPE) width, 3, + "input: feature dimension must be >= width"); + + // Make sure that width and stride are within range + THArgCheck(width >= 2 && width <= 16, 7, + "width must be between 2 - 16"); + + THArgCheck(stride >= 1 && stride <= 4, 8, + "stride must be between 1 - 4"); + + for (int i = 0; i < 4; ++i) { + THAssertMsg(outputDesc.size[i] == gradOutputDesc.size[i], + "output and gradOutput sizes do not match"); + } + + // Make sure that the input sizes produce the output sizes + THArgCheck(flpOutputSize(FEATURE_LP_CAST_TYPE inputDesc.size[1], width, stride) == + outputDesc.size[1], 3, + "input and output sizes do not match with respect to " + "width and stride"); + + // Resize `gradInput` based on `input` + THNN_(FeatureLPPooling_resizeCPU)(gradInput, input); + + // Zero gradInput for accumulation + THTensor_(zero)(gradInput); + + FeatureLPPoolingSizes gradInputDesc = + THNN_(FeatureLPPooling_upcastCPU)(gradInput, batchMode); + + real* gradOutputP = THTensor_(data)(gradOutput); + real* gradInputP = THTensor_(data)(gradInput); + real* outputP = THTensor_(data)(output); + real* inputP = THTensor_(data)(input); + + FEATURE_LP_SIZE_TYPE batch, opt1, opt2, outputFeature, i; + +#pragma omp parallel for + for (batch = 0; batch < FEATURE_LP_CAST_TYPE inputDesc.size[0]; ++batch) { + for (opt1 = 0; opt1 < FEATURE_LP_CAST_TYPE inputDesc.size[2]; ++opt1) { + for (opt2 = 0; opt2 < FEATURE_LP_CAST_TYPE inputDesc.size[3]; ++opt2) { + for (outputFeature = 0; + outputFeature < FEATURE_LP_CAST_TYPE outputDesc.size[1]; ++outputFeature) { + + // Load output (f(x_is)). It is possible that this is zero, in + // which case we'll ignore this point. 
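          // [Editor's derivation, not part of the original source.] For the LP
          // pooling output f(x) = (sum_i x_i^p)^(1/p), the partial derivative is
          //   df/dx_i = x_i^(p-1) * (sum_j x_j^p)^(1/p - 1) = (x_i / f)^(p-1),
          // which is exactly the factor applied to gradOutputV in the loop
          // below. Skipping f == 0 here presumably avoids the division by zero
          // in that factor (e.g. for an all-zero pooling window), in which case
          // the gradient contribution is simply left at zero.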
+ real outputV = + outputP[ + flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)]; + + if (outputV == (real) 0) { + continue; + } + + for (i = 0; i < (FEATURE_LP_SIZE_TYPE) width; ++i) { + FEATURE_LP_SIZE_TYPE inputFeature = outputFeature * stride + i; + THAssert(inputFeature < inputDesc.size[1]); + + real gradOutputV = + gradOutputP[ + flpGetOffset(&gradOutputDesc, batch, outputFeature, opt1, opt2)]; + real inputV = + inputP[ + flpGetOffset(&inputDesc, batch, inputFeature, opt1, opt2)]; + + // Calculate grad * (x_i / f(x_is))^(p - 1) + real v = gradOutputV * pow(inputV / outputV, power - (accreal) 1); + + gradInputP[ + flpGetOffset(&gradInputDesc, batch, inputFeature, opt1, opt2)] + += v; + } + } + } + } + } +} + +#endif diff --git a/aten/src/THNN/generic/FusedRNNKernel.c b/aten/src/THNN/generic/FusedRNNKernel.c new file mode 100644 index 0000000..30788b0 --- /dev/null +++ b/aten/src/THNN/generic/FusedRNNKernel.c @@ -0,0 +1,55 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/FusedRNNKernel.c" +#else + +void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *hx, + THTensor *hy, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *cx, + THTensor *hy, + THTensor *cy) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *prevC, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +#endif diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c new file mode 100644 index 0000000..68cdc37 --- /dev/null +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -0,0 +1,73 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/GatedLinearUnit.c" +#else + +void THNN_(GatedLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int dim) +{ + // size output to half of input + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + const int64_t inputSize = THTensor_(size)(input, dim) / 2; + THLongStorage *newSizes = THTensor_(newSizeOf)(input); + THLongStorage_set(newSizes, dim, inputSize); + THTensor_(resize)(output, newSizes, NULL); + + // halve tensor + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + + // x = x1:cmul( sigmoid(x2) ) + THTensor_(sigmoid)(output, secondHalf); + THTensor_(cmul)(output, output, firstHalf); + + THLongStorage_free(newSizes); + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); +} + +void THNN_(GatedLinear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int dim) +{ + // set up tensors + dim = dim - TH_INDEX_BASE; + const int64_t nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + THTensor_(resizeAs)(gradInput, input); + const int64_t inputSize = THTensor_(size)(input, dim) / 2; + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize); + THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize); + + THTensor_(sigmoid)(gradInputfirstHalf, secondHalf); + + TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf, + real z = *gradInputfirstHalf_data; + *gradInputsecondHalf_data = (1. - z) * z; + ); + + THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput); + + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput); + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf); + + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); + THTensor_(free)(gradInputfirstHalf); + THTensor_(free)(gradInputsecondHalf); +} + +#endif diff --git a/aten/src/THNN/generic/HardShrink.c b/aten/src/THNN/generic/HardShrink.c new file mode 100644 index 0000000..18dea95 --- /dev/null +++ b/aten/src/THNN/generic/HardShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardShrink.c" +#else + +void THNN_(HardShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data > lambda) + *output_data = *input_data; + else if (*input_data >= -lambda) + *output_data = 0; + else + *output_data = *input_data; // let NaN case pass through here + ); +} + +void THNN_(HardShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data >= -lambda && *input_data <= lambda) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; // let NaN case pass through here + ); +} + +#endif diff --git a/aten/src/THNN/generic/HardTanh.c b/aten/src/THNN/generic/HardTanh.c new file mode 100644 index 0000000..a19c0ce --- /dev/null +++ b/aten/src/THNN/generic/HardTanh.c @@ -0,0 +1,138 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE 
"generic/HardTanh.c" +#else + +void THNN_(HardTanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + if (inplace) + THTensor_(set)(output, input); + else + THTensor_(resizeAs)(output, input); + + if (input->_dim() == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data < min_val) + *input_data = min_val; + else if (*input_data > max_val) + *input_data = max_val; + ); + } + else + { + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data < min_val) + *output_data = min_val; + else if (*input_data > max_val) + *output_data = max_val; + else + *output_data = *input_data; + ); + } + } + else + { + real* ptr_input = THTensor_(data)(input); + real* ptr_output = THTensor_(data)(output); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_input[i] = min_val; + else if (ptr_input[i] > max_val) + ptr_input[i] = max_val; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_output[i] = min_val; + else if (ptr_input[i] <= max_val) + ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = max_val; + } + } +} + +void THNN_(HardTanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + THTensor_(set)(gradInput, gradOutput); + else + THTensor_(resizeAs)(gradInput, input); + + if (input->_dim() == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradOutput_data = 0; + ); + } + else + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_input = THTensor_(data)(input); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Im2Col.c b/aten/src/THNN/generic/Im2Col.c new file mode 100644 index 0000000..bbb0dd8 --- /dev/null +++ b/aten/src/THNN/generic/Im2Col.c @@ -0,0 +1,119 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Im2Col.c" +#else + +static inline void THNN_(Im2Col_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { + + 
THArgCheck(kW > 0 && kH > 0, 4, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 6, + "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(sW > 0 && sH > 0, 10, + "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); + + int64_t ndim = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); + + int64_t dim_batch = 0; + if (ndim == 3) { + dim_batch = -1; + } + int64_t nInputPlane = THTensor_(size)(input, dim_batch + 1); + int64_t inputHeight = THTensor_(size)(input, dim_batch + 2); + int64_t inputWidth = THTensor_(size)(input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + if (outputHeight < 1 || outputWidth < 1) { + THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " + "dilation=(%d, %d), padding=(%d, %d), calculated " + "shape of the array of sliding blocks as (%d, %d), which is " + "too small (non-positive).", + inputHeight, inputHeight, kH, kW, dH, dW, padH, padW, + outputHeight, outputWidth); + } +} + +void THNN_(Im2Col_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); + + input = THTensor_(newContiguous)(input); + bool batched_input = true; + if (input->dim() == 3) { + batched_input = false; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nInputPlane = THTensor_(size)(input, 1); + int64_t inputHeight = THTensor_(size)(input, 2); + int64_t inputWidth = THTensor_(size)(input, 3); + + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; + + THTensor_(resize3d)(output, batchSize, nOutputPlane, outputLength); + THTensor_(zero)(output); + + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + for (int64_t elt = 0; elt < batchSize; elt++) { + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, sH, sW, + dH, dW, THTensor_(data)(output_n)); + } + + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + if (!batched_input) { + THTensor_(resize2d)(output, nOutputPlane, outputLength); + } + THTensor_(free)(input); +} + +void THNN_(Im2Col_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { + + + THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, + inputHeight, inputWidth, + kH, kW, dH, dW, + padH, padW, sH, sW); +} + + +#endif diff --git a/aten/src/THNN/generic/IndexLinear.c 
b/aten/src/THNN/generic/IndexLinear.c new file mode 100644 index 0000000..50aa93d --- /dev/null +++ b/aten/src/THNN/generic/IndexLinear.c @@ -0,0 +1,727 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/IndexLinear.c" +#else + +#ifdef _OPENMP +#include +#endif + +/* Threshold used to trigger multithreading */ +#ifndef THNN_SPARSE_OMP_THRESHOLD +#define THNN_SPARSE_OMP_THRESHOLD 100000 +#endif + +/* Threshold used to trigger BLAS axpy call */ +#ifndef THNN_SPARSE_OUTDIM_THRESHOLD +#define THNN_SPARSE_OUTDIM_THRESHOLD 49 +#endif + +/* sign MACRO */ +#ifndef THNN_INDEXLINEAR_SIGN +#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) ) +#endif + +static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values) +{ + return THLongTensor_size(keys, 0) == THTensor_(nElement)(values) + && THTensor_(_nDimension)(values) == 1 + && THLongTensor__nDimension(keys) == 1; +} + +void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train) +{ + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t keysSize = THLongTensor_size(keys, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + int64_t* sizesData = THLongTensor_data(sizes); + int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes); + + /* Define/resize the normalized values tensor if maxNormalize is > 0 */ + real* normalizedValuesData = NULL; + if (maxNormalize) + { + THTensor_(resize1d)(normalizedValues, keysSize); + normalizedValuesData = THTensor_(data)(normalizedValues); + } + + /* Resize the output */ + THTensor_(resize2d)(output, batchSize, outDim); + + /* Access the storage data/strides */ + real* outputData = THTensor_(data)(output); + real* valuesData = THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + int64_t weightStride0 = weight->stride[0]; + real* biasData = THTensor_(data)(bias); + int64_t* keysData = THLongTensor_data(keys); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous"); + int64_t i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
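+   * Layout note: weight has woutDim = outDim + maxNormalize columns. When
+   * maxNormalize > 0, the first maxNormalize entries of a key's row hold its
+   * normalization state (roughly: running max |value|, its reciprocal, an
+   * update scale and an additive correction), and the actual output weights
+   * start at column maxNormalize, hence the woffset + maxNormalize offsets
+   * below.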
*/ + if (outDim == 1) + { + THVector_(fill)(outputData, *biasData, batchSize); + if (maxNormalize) + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, keysOffset, \ + weightData, keysData, \ + valuesData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + real* loutputData = outputData + j; + real val = 0; + real absVal = 0; + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + + for (i = 0; i < sizesData[j]; i++) + { + int64_t woffset = weightStride0*(keysData[offset] + keysOffset); + absVal = fabs(valuesData[offset]); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * This is used at update time. + * TODO: implement a smarter update scale. + */ + weightData[woffset+2] = 1; + } + normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3]; + val += normalizedValuesData[offset] * weightData[woffset+maxNormalize]; + offset++; + } + *loutputData += val; + } + } + else + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + outputData, cumSumSizesData, \ + sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real* loutputData = outputData + j; + real val = 0; + + for (i = 0; i < sizesData[j]; i++) + { + val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset]; + offset++; + } + *loutputData += val; + } + } + } + else { +#pragma omp parallel \ + for private(i,j,k) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + biasData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real val; + real* loutputData = outputData + j*outDim; + real* lweightData = weightData; + memcpy(loutputData, biasData, outDim*sizeof(real)); + for (i = 0; i < sizesData[j]; i++) + { + int64_t woffset = weightStride0*(keysData[offset] + keysOffset); + if (maxNormalize) + { + val = valuesData[offset]; + real absVal = fabs(val); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * The commented section thereafter is just an example of what can be done: + * + *``` + * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1)); + * real alpha = 1; + * real beta = 0.01; + * real gamma = 1 - 0.000001; + * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta); + * l = gamma*l; + * weightData[woffset+2] = (alpha-beta)*l + beta; + * ``` + * + * TODO: implement a smarter update scale. 
+ */ + weightData[woffset+2] = 1; + } + + /* Normalize + Clamp */ + val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3]; + normalizedValuesData[offset] = val; + + lweightData = weightData + woffset + maxNormalize; + } + else + { + val = valuesData[offset]; + lweightData = weightData + woffset; + } + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + loutputData[k] += lweightData[k] * val; + } + } + offset++; + } + } + } + return; +} + +void THNN_(IndexLinear_updateParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THLongTensor *runningKeys, + THLongTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay_, + accreal learningRate_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + /* Retrieve all the dimensions of the problem */ + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + int64_t keysSize = THLongTensor_size(runningKeys, 0); + + /* Access the storage data/strides */ + real* gradWeightData = THTensor_(data)(gradWeight); + real* weightData = THTensor_(data)(weight); + int64_t weightStride0 = weight->stride[0]; + real* gradBiasData = THTensor_(data)(gradBias); + real* biasData = THTensor_(data)(bias); + int64_t* keysData = THLongTensor_data(runningKeys); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 3, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 4, "gradBias vector must be contiguous"); + THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous"); + + int j, k; + + /* Update the bias first */ + THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim); + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
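+   * The bias was already updated densely above; the loops below touch only
+   * the weight rows referenced by runningKeys.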
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr; + } + } + } + else + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + int64_t woffset = weightStride0*(keysData[j] + keysOffset); + weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate; + } + } + } + } + else + { + for (j = 0; j < keysSize; j++) + { + real lr = learningRate; + real wd = weightDecay; + real* lweightData; + int64_t woffset = weightStride0*(keysData[j] + keysOffset); + real* lgradWeightData = gradWeightData + j*outDim; + if (maxNormalize) + { + lgradWeightData += j*outDim; + /* weightData[woffset + 2] */ + lweightData = weightData + woffset + maxNormalize - 2; + lr = lr*lweightData[0]; + wd = weightDecay*lweightData[0]; + /* weightData[woffset + 3] */ + lweightData++; + for (k=0; k < outDim; k++) + { + lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr; + } + lweightData++; + lgradWeightData += outDim; + } + else + { + lweightData = weightData + woffset; + } + + /* We do sparse weight decay. + * We think it makes more sense. 
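+       * "Sparse" means the decay is applied only to the weight rows touched
+       * by this update (the keys in runningKeys), not to the whole weight
+       * matrix on every step.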
*/ + if (weightDecay) + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lweightData[k]*wd; + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lgradWeightData[k]*lr; + } + } + } + } +} + + +void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + + /* Access the storage data/strides */ + real* gradOutputData = THTensor_(data)(gradOutput); + real* valuesData =THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + real* biasData = THTensor_(data)(bias); + int64_t weightStride0 = weight->stride[0]; + int64_t* keysData = THLongTensor_data(keys); + int64_t* sizesData = THLongTensor_data(sizes); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous"); + + int i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-1] -= weightData[idx]*val*weightData[idx-2]; + weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2]; + offset++; + } + } + + offset = 0; + for (j = 0; j < batchSize; j++) + { + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-2] = 0; + offset++; + } + } + } + else + { + if (weightDecay) + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + for (i = 0; i < sizesData[j]; i++) + { + int64_t idx = weightStride0*(keysData[offset] + keysOffset); + weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay; + offset++; + } + } + } + else + { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real val = gradOutputData[j] * scale; + for (i = 0; i < sizesData[j]; i++) + { + weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset]; + offset++; + } + *biasData -= val; + } + } + } + } + else { + int64_t offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j*outDim; + real* lweightData = weightData; + THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + real wd = weightDecay; + + // Max normalize case + if (maxNormalize) + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); + val *= lweightData[0]; + wd *= lweightData[0]; + for (k=0; k < outDim; k++) + { + lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0]; + } + lweightData += 2; + } + else + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset); + } + + /* We do sparse weight decay. + * We think it makes more sense. */ + if (weightDecay) + { + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= wd * lweightData[k]; + } + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= val * lgradOutputData[k]; + } + } + offset++; + } + } + + /* Max Normalize case: + * Reset the smart update scaling if + * one does it batch-wise. + * TODO: Decide what to do with that piece of code. 
+ * NB: If the code belowe is uncommented, so should the commented + * code in IndexLinear:zeroGradParameters() */ + + /* + if (maxNormalize) + { + offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lweightData = weightData; + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + real wd = weightDecay; + + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); + lweightData[0] = 0; + offset++; + } + } + } + */ + } + return; +} + +void THNN_(IndexLinear_accGradParameters)( + THNNState *state, + THLongTensor *keys, + int64_t keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THTensor *valuesBuffer, + accreal weightDecay_, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + /* Retrieve all the dimensions of the problem */ + int64_t batchSize = THLongTensor_size(sizes, 0); + int64_t keysSize = THLongTensor_size(keys, 0); + int64_t outDim = THTensor_(size)(bias, 0); + int64_t woutDim = THTensor_(size)(weight, 1); + int64_t maxNormalize = (woutDim - outDim) > 0 ?1:0; + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + int64_t* sizesData = THLongTensor_data(sizes); + + /* COmpute the cumulative sizes */ + THLongTensor* cumSizes = THLongTensor_new(); + THLongTensor_cumsum(cumSizes, sizes, 0); + int64_t* cumSizesData = THLongTensor_data(cumSizes); + + /* Resize the gradWeight buffer to keep it dense. + * That speeds up updates A LOT assuming random mem access. */ + THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1)); + + /* Access the storage data/strides */ + real* gradOutputData = THTensor_(data)(gradOutput); + real* valuesData =THTensor_(data)(values); + real* gradWeightData = THTensor_(data)(gradWeight); + real* gradBiasData = THTensor_(data)(gradBias); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous"); + + int i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
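+   * gradWeight was resized above to one dense row per key occurrence
+   * ([keysSize, outDim], or [keysSize, 2*outDim] when max-normalization is
+   * enabled), so the accumulated gradients can later be applied with
+   * contiguous memory access.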
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + for (j = 0; j < batchSize; j++) + { + int64_t offset = j==0?0:cumSizesData[j-1]; + real val = gradOutputData[j] * scale; + real* lgradWeightData = gradWeightData + offset; + real* lvaluesData = valuesData + offset; + int64_t end = sizesData[j]; + + if (maxNormalize) + { + lgradWeightData += offset; + i = 0; + for(;i < end; i++) + { + lgradWeightData[2*i] = val; + lgradWeightData[2*i+1] = val * lvaluesData[i]; + } + } + else + { + i = 0; + for(;i < end-4; i += 4) + { + lgradWeightData[i] = val * lvaluesData[i]; + lgradWeightData[i+1] = val * lvaluesData[i+1]; + lgradWeightData[i+2] = val * lvaluesData[i+2]; + lgradWeightData[i+3] = val * lvaluesData[i+3]; + } + + for(; i < end; i++) + { + lgradWeightData[i] = val * lvaluesData[i]; + } + } + *gradBiasData += val; + offset += end; + } + } + else { + for (j = 0; j < batchSize; j++) + { + int64_t offset = j==0?0:cumSizesData[j-1]; + real* lgradOutputData = gradOutputData + j*outDim; + real* lgradWeightData = gradWeightData; + THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + lgradWeightData = gradWeightData + offset*outDim; + if (maxNormalize) + { + lgradWeightData += offset*outDim; + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + lgradWeightData[k+1] = lgradOutputData[k+1]*scale; + lgradWeightData[k+2] = lgradOutputData[k+2]*scale; + lgradWeightData[k+3] = lgradOutputData[k+3]*scale; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + } + lgradWeightData += outDim; + } + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = val * lgradOutputData[k]; + lgradWeightData[k+1] = val * lgradOutputData[k+1]; + lgradWeightData[k+2] = val * lgradOutputData[k+2]; + lgradWeightData[k+3] = val * lgradOutputData[k+3]; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = val * lgradOutputData[k]; + } + offset++; + } + } + } + THLongTensor_free(cumSizes); + return; +} +#endif diff --git a/aten/src/THNN/generic/L1Cost.c b/aten/src/THNN/generic/L1Cost.c new file mode 100644 index 0000000..8f5eb17 --- /dev/null +++ b/aten/src/THNN/generic/L1Cost.c @@ -0,0 +1,38 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/L1Cost.c" +#else + +void THNN_(L1Cost_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + accreal sum = 0; + + TH_TENSOR_APPLY(real, input, + sum += fabs(*input_data); + ); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(L1Cost_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY2(real, gradInput, real, input, + if (*input_data > 0) + *gradInput_data = 1; + else if (*input_data < 0) + *gradInput_data = -1; + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/LeakyReLU.c b/aten/src/THNN/generic/LeakyReLU.c new file mode 100644 index 0000000..abca9fb --- /dev/null +++ b/aten/src/THNN/generic/LeakyReLU.c @@ -0,0 +1,58 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LeakyReLU.c" +#else + +void THNN_(LeakyReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + 
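+  // LeakyReLU: f(x) = x for x > 0 and negval * x otherwise. The in-place
+  // branch below only rescales the non-positive entries of input.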
if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + *input_data *= negval; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + const real r = (*input_data > 0) ? 1 : negval; + *output_data = *input_data * r; + ); + } +} + +void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + *gradOutput_data *= negval; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval; + ); + } +} + +#endif diff --git a/aten/src/THNN/generic/Linear.c b/aten/src/THNN/generic/Linear.c new file mode 100644 index 0000000..630dc4c --- /dev/null +++ b/aten/src/THNN/generic/Linear.c @@ -0,0 +1,114 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Linear.c" +#else + +void THNN_(Linear_updateAddBuffer)( + THNNState *state, + THTensor *input, + THTensor *addBuffer) +{ + int64_t nframe = THTensor_(size)(input,0); + int64_t nElement = THTensor_(nElement)(addBuffer); + if (nElement != nframe) { + THTensor_(resize1d)(addBuffer,nframe); + THTensor_(fill)(addBuffer,1.0); + } +} + +void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer) +{ + int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor_(resize1d)(output,THTensor_(size)(weight,0)); + if (bias) { + THTensor_(copy)(output,bias); + } + else { + THTensor_(zero)(output); + } + THTensor_(addmv)(output,1,output,1,weight,input); + } + else if (dim == 2) { + int64_t nframe = THTensor_(size)(input,0); + int64_t nElement = THTensor_(nElement)(output); + THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0)); + if (THTensor_(nElement)(output) != nElement) { + THTensor_(zero)(output); + } + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmm)(output,0,output,1,input,tweight); + THTensor_(free)(tweight); + if (bias) { + THTensor_(addr)(output,1,output,1,addBuffer,bias); + } + } +} + +void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ + if (gradInput) { + int64_t nElement = THTensor_(nElement)(gradInput); + THTensor_(resizeAs)(gradInput,input); + if (THTensor_(nElement)(gradInput) != nElement) { + THTensor_(zero)(gradInput); + } + + int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput); + THTensor_(free)(tweight); + } + else if (dim == 2) { + THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight); + } + } +} + +void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + 
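+  // Accumulates gradWeight += scale * gradOutput^T * input and
+  // gradBias += scale * gradOutput^T * ones(nframe), where the ones vector
+  // is addBuffer. The 1D case uses an outer product (addr) and a vector add
+  // instead of the matrix products.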
int64_t dim = THTensor_(_nDimension)(input); + if (dim == 1) { + THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input); + if (bias) { + THTensor_(cadd)(gradBias,gradBias,scale,gradOutput); + } + } + else if (dim == 2) { + THTensor *tgradOutput = THTensor_(new)(); + THTensor_(transpose)(tgradOutput,gradOutput,0,1); + THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input); + if (bias) { + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer); + } + THTensor_(free)(tgradOutput); + } +} + +#endif diff --git a/aten/src/THNN/generic/LogSigmoid.c b/aten/src/THNN/generic/LogSigmoid.c new file mode 100644 index 0000000..556af4f --- /dev/null +++ b/aten/src/THNN/generic/LogSigmoid.c @@ -0,0 +1,51 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSigmoid.c" +#else + +void THNN_(LogSigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer) +{ + THTensor_(resizeAs)(output, input); + THTensor_(resizeAs)(buffer, input); + //Use the LogSumExp trick to make this stable against overflow + TH_TENSOR_APPLY3(real, output, real, input, real, buffer, + real max_elem = fmax(0, -*input_data); + real z = exp(-max_elem) + exp(-*input_data - max_elem); + *buffer_data = z; + *output_data = -(max_elem + log(z)); + ); +} + +void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, buffer); +/* deriv of -max(0,-x) - log(e(0 - max(0,-x)) + e(-x - max(0,-x))) is + * -max_deriv - (-max_deriv*e(0-max(0,-x)) + (-1 - max_deriv)*e(-x - max(0,-x)))/z + * where z = e(0 - max(0,-x)) + e(-x - max(0,-x)) + * which simplifies to + * -max_deriv - (z-1)/z if x is >= 0 or + * -max_deriv + (z-1)/z if x is < 0 + */ + TH_TENSOR_APPLY3(real, input, real, gradInput, real, buffer, + real z = *buffer_data; + real max_deriv = 0.0; + real sign = -1.0; + if (*input_data < 0){ + max_deriv = -1.0; + sign = 1.0; + } + *gradInput_data = -max_deriv - sign*((z - 1.0)/ z); + ); + THTensor_(cmul)(gradInput, gradOutput, gradInput); +} + +#endif diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c new file mode 100644 index 0000000..05694fc --- /dev/null +++ b/aten/src/THNN/generic/LookupTable.c @@ -0,0 +1,225 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LookupTable.c" +#else + +static void THNN_(LookupTable_resetCount)( + THInteger_t *count_data, + THIndexTensor *input) +{ + ptrdiff_t i; + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + + for (i = 0; isize[0]); + count_data = THIntegerTensor_(data)(count); + } + + if (!THTensor_(isContiguous)(gradWeight)) + THError("gradWeight must be contiguous"); + if (!THIndexTensor_(isContiguous)(input)) + THError("input must be contiguous"); + if (input->is_empty() || (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)) { + THDescBuff s1 = THIndexTensor_(sizeDesc)(input); + THError("input must be a non-empty vector or matrix, but is of shape: %s", s1.str); + } + + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + int64_t numw = THTensor_(size)(gradWeight, 0); + + // check that inputs are all within range + for (i=0; i= numw + TH_INDEX_BASE) { + THError("inputs need to be in the range %ld <= input < %ld, " + "but 
got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + input_data[i]); + } + + gradOutput = THTensor_(newContiguous)(gradOutput); + + real *gw = THTensor_(data)(gradWeight); + real *go = THTensor_(data)(gradOutput); + int64_t stride = THTensor_(stride)(gradWeight, 0); + + if (count_data) + THNN_(LookupTable_resetCount)(count_data, input); + +#ifdef _OPENMP + if (numel > 1000) + { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. + #pragma omp parallel private(i) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + + int64_t start = tid * (numw/nthreads + 1); + int64_t end = start + (numw/nthreads + 1); + for (i=0; i= start && k < end) + { + real scale_ = scale; + if (count_data) scale_ /= count_data[k]; + THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); + } + } + } + } + + THTensor_(free)(gradOutput); + return; + } +#endif + + for (i=0; i maxNorm) + { + new_norm = maxNorm / (norm + 1e-7); + for (j=0; jis_empty() || THIndexTensor_(nDimension)(idx) != 1) + THError("idx must be a non-empty vector"); + if (normType <= 0) + THError("non-positive-norm not supported"); + + ptrdiff_t i; + THIndex_t *row_idx = THIndexTensor_(data)(idx); + ptrdiff_t numel = THIndexTensor_(nElement)(idx); + + int64_t numw = THTensor_(size)(weight, 0); + int64_t stride = THTensor_(stride)(weight, 0); + real *gw = THTensor_(data)(weight); + for (i=0; i= numw + TH_INDEX_BASE) { + THError("input need to be in the range %ld <= input < %ld, " + "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + row_idx[i]); + } + } + // get unique indices + qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); + ptrdiff_t ptr = 0; + for (i=0; i 1000) + { + // The strategy is to parallelize over the rows that appear in + // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads]. + // This distributes the work evenly to each thread. + #pragma omp parallel for private(i) + for (i=0; i0 ? z : 0; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data * *target_data) < margin ? 
-norm * *target_data : 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c new file mode 100644 index 0000000..3072c03 --- /dev/null +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -0,0 +1,256 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + int64_t reduction) +{ + real *input_data, *isTarget_data; + THIndex_t *target_data; + int64_t nframe, dim; + int64_t t, d, dt, ddt; + real sum; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), + "inconsistent target size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size[0] == nframe) + && (target->size[1] == dim), "inconsistent target size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + + THNN_resizeAs_indices(isTarget, target); + THTensor_(zero)(isTarget); + isTarget_data = THTensor_(data)(isTarget); + + if (reduction != Reduction::None) + { + THTensor_(resize1d)(output, 1); + + sum = 0; + for (t = 0; t < nframe; t++) + { + for (ddt = 0; ddt < dim; ddt++) + { + THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + sum += z; + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + } + + sum /= dim; + if (reduction == Reduction::ElementwiseMean) + sum /= nframe; + THTensor_(fastSet1d)(output, 0, sum); + + THTensor_(free)(input); + THIndexTensor_(free)(target); + return; + } + + THTensor_(resize1d)(output, nframe); + + for (t = 0; t < nframe; t++) + { + for (ddt = 0; ddt < dim; ddt++) + { + THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } + + sum = 0; + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + sum += z; + } + } + } + + sum /= dim; + THTensor_(fastSet1d)(output, t, sum); + + input_data += dim; + target_data += dim; + isTarget_data += dim; + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor 
*gradOutput, + THTensor *gradInput, + THTensor *isTarget, + int64_t reduction) +{ + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *isTarget_data; + int64_t nframe, dim; + int64_t t, d, dt; + real g; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size[0] == dim), + "inconsistent target size"); + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size[0] == dim), + "inconsistent isTarget size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size[0] == nframe) + && (isTarget->size[1] == dim), 3, "inconsistent isTarget size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range"); + THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + isTarget = THTensor_(newContiguous)(isTarget); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + isTarget_data = THTensor_(data)(isTarget); + + THTensor_(resizeAs)(gradInput, input); + gradInput = THTensor_(newContiguous)(gradInput); + THTensor_(zero)(gradInput); + gradInput_data = THTensor_(data)(gradInput); + + g = reduction == Reduction::ElementwiseMean ? 
(1./((real)(nframe*dim))) : (1./((real)dim)); + + for (t = 0; t < nframe; t++) + { + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + { + gradInput_data[target_idx] -= g; + gradInput_data[d] += g; + } + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + gradInput_data += dim; + } + gradInput_data = THTensor_(data)(gradInput); + + if (reduction != Reduction::None) + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + for (t = 0; t < nframe*dim; t++) + { + gradInput_data[t] *= THTensor_(fastGet1d)(gradOutput, 0); + } + } + else + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, nframe); + for (t = 0; t < nframe; t++) + { + for (d = 0; d < dim; d++) + { + gradInput_data[t * dim + d] *= THTensor_(fastGet1d)(gradOutput, t); + } + } + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + THTensor_(free)(isTarget); + THTensor_(free)(gradInput); +} + +#endif diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c new file mode 100644 index 0000000..620e13c --- /dev/null +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -0,0 +1,223 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data, *weights_data; + THIndex_t *target_data; + int64_t nframe, dim; + int64_t t, d; + real sum; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + "inconsistent target size, got: ", target->sizes()); + } + + for (t = 0; t < nframe; t++) + { + THIndex_t idx = THIndexTensor_(get1d)(target, t); + THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, + "target out of range"); + } + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + weights_data = weights ? THTensor_(data)(weights) : NULL; + + if (reduction == Reduction::None) + { + THTensor_(resize1d)(output, nframe); + + for (t = 0; t < nframe; t++) + { + sum = 0; + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) { + real h = (p==1) ? 
z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } + } + + sum /= dim; + THTensor_(fastSet1d)(output, t, sum); + input_data += dim; + } + } + else + { + THTensor_(resize1d)(output, 1); + + sum = 0; + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) { + real h = (p==1) ? z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } + } + input_data += dim; + } + + sum /= dim; + if(reduction == Reduction::ElementwiseMean) + sum /= nframe; + + THTensor_(set1d)(output, 0, sum); + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *weights_data; + int64_t nframe, dim; + int64_t t, d; + real g; + + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + "non-empty vector or matrix expected, got size: ", input->sizes()); + + if (input->dim() == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + "inconsistent target size, got: ", target->sizes()); + } + + g = (reduction == Reduction::ElementwiseMean ? 1./((real)(nframe*dim)) : 1./((real)dim)); + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + THArgCheck(THTensor_(isContiguous)(gradInput), 5, "gradInput must be contiguous"); + gradInput_data = THTensor_(data)(gradInput); + + target_data = THIndexTensor_(data)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + weights_data = weights ? THTensor_(data)(weights) : NULL; + + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + real gradInput_target = 0; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) + { + real h = (p == 1) ? 
g : 2*g*z; + if(weights_data) + h *= weights_data[target_idx]; + gradInput_target -= h; + gradInput_data[d] = h; + } + else + gradInput_data[d] = 0; + } + gradInput_data[target_idx] = gradInput_target; + + input_data += dim; + gradInput_data += dim; + } + gradInput_data = THTensor_(data)(gradInput); + + if (reduction != Reduction::None) + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + for (t = 0; t < nframe * dim; t++) { + gradInput_data[t] *= THTensor_(fastGet1d)(gradOutput, 0); + } + } + else + { + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, nframe); + for (t = 0; t < nframe; t++) + { + for (d = 0; d < dim; d++) + { + gradInput_data[t * dim + d] *= THTensor_(fastGet1d)(gradOutput, t); + } + } + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +#endif diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c new file mode 100644 index 0000000..462280c --- /dev/null +++ b/aten/src/THNN/generic/PReLU.c @@ -0,0 +1,202 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/PReLU.c" +#else + +void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight) +{ + THTensor_(resizeAs)(output, input); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + // handle shared parameter case + real w = *THTensor_(data)(weight); + TH_TENSOR_APPLY2(real, output, real, input, + const real r = (*input_data > 0) ? 1 : w; + *output_data = *input_data * r; + ); + return; + } + + input = THTensor_(newContiguous)(input); + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + real *weight_data = THTensor_(data)(weight); + THIndex_t i, j, k; + #pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + real* n_input_data = input_data + i*nOutputPlane*ks; + real* n_output_data = output_data + i*nOutputPlane*ks; + for (j = 0; j < nOutputPlane; ++j) + { + for (k = 0; k < ks; ++k) + n_output_data[k] = (n_input_data[k] > 0) ? 
n_input_data[k] : weight_data[j] * n_input_data[k]; + n_input_data += ks; + n_output_data += ks; + } + } + THTensor_(free)(input); +} + +void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + real w = THTensor_(data)(weight)[0]; + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > 0) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = w * (*gradOutput_data); + ); + return; + } + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradInput_data = THTensor_(data)(gradInput); + + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + THIndex_t i, j, k; + #pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real w = weight_data[j]; + for (k = 0; k < ks; ++k) + { + if (n_input_data[k] > 0) + n_gradInput_data[k] = n_gradOutput_data[k]; + else + n_gradInput_data[k] = n_gradOutput_data[k] * w; + } + n_input_data += ks; + n_gradInput_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_CHECK_NELEMENT(input, gradOutput); + int64_t nOutputPlane = THTensor_(numel)(weight); + + if (nOutputPlane == 1) + { + real *gradWeight_data = THTensor_(data)(gradWeight); + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, gradOutput, + if ((*input_data) <= 0) + sum += (*input_data) * (*gradOutput_data); + ); + gradWeight_data[0] += scale * sum; + return; + } + + THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous"); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + int64_t bs = 1, ks = 1; + { + int64_t input_ndim = THTensor_(_nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + + THIndex_t i, j, k; + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real sum = 0; + for (k = 0; k < ks; ++k) + if (n_input_data[k] <= 0) + sum += n_gradOutput_data[k] * n_input_data[k]; + gradWeight_data[j] += scale * sum; + n_input_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +#endif diff --git a/aten/src/THNN/generic/RReLU.c b/aten/src/THNN/generic/RReLU.c new file mode 100644 index 0000000..8fd46d3 --- /dev/null +++ b/aten/src/THNN/generic/RReLU.c @@ -0,0 +1,132 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/RReLU.c" +#else + +void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace, + THGenerator *generator) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + if (train) + { + // get default random generator + THTensor_(resizeAs)(noise, input); + if (inplace) + { + TH_TENSOR_APPLY2(real, input, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *input_data = (*input_data) * r; + *noise_data = r; + } + else + { + *noise_data = 1; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, output, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *output_data = (*input_data) * r; + *noise_data = r; + } + else + { + *output_data = *input_data; + *noise_data = 1; + } + ); + } + } + else + { + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + { + *input_data = *input_data * negSlope; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + const real r = (*input_data) <= 0 ? negSlope : 1; + *output_data = *input_data * r; + ); + } + } +} + +void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THTensor_(cmul)(gradOutput, gradOutput, noise); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + THTensor_(cmul)(gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + { + *gradOutput_data = (*gradOutput_data) * negSlope; + } + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data); + ); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Sigmoid.c b/aten/src/THNN/generic/Sigmoid.c new file mode 100644 index 0000000..2b218dd --- /dev/null +++ b/aten/src/THNN/generic/Sigmoid.c @@ -0,0 +1,27 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sigmoid.c" +#else + +void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(sigmoid)(output, input); +} + +void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_NELEMENT(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; + *gradInput_data = *gradOutput_data * (1. - z) * z; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SmoothL1Criterion.c b/aten/src/THNN/generic/SmoothL1Criterion.c new file mode 100644 index 0000000..b9eca65 --- /dev/null +++ b/aten/src/THNN/generic/SmoothL1Criterion.c @@ -0,0 +1,80 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" +#else + +void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, target, real, output, + real z = fabs(*input_data - *target_data); + *output_data = z < 1 ? 0.5 * z * z : z - 0.5; + ); + return; + } + + THTensor_(resize1d)(output, 1); + + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = fabs(*input_data - *target_data); + sum += z < 1 ? 0.5*z*z : z - 0.5; + ); + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (reduction == Reduction::None) { + THNN_CHECK_SHAPE(gradOutput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if (x < -1.) { + *gradInput_data = -1.; + } else if (x > 1.) { + *gradInput_data = 1.; + } else { + *gradInput_data = x; + } + ); + TH_TENSOR_APPLY2(real, gradInput, real, gradOutput, + *gradInput_data *= *gradOutput_data; + ); + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.) 
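/*
  The smooth L1 criterion used here is the Huber loss with a fixed threshold of 1.
  A standalone sketch of the element-wise value and derivative (the helper names are
  illustrative only):

      #include <math.h>

      // 0.5 * z * z for |x - t| < 1, |x - t| - 0.5 otherwise
      static double smooth_l1(double x, double t) {
          double z = fabs(x - t);
          return z < 1.0 ? 0.5 * z * z : z - 0.5;
      }

      // derivative with respect to x: the difference clamped to [-1, 1]
      static double smooth_l1_grad(double x, double t) {
          double d = x - t;
          return d < -1.0 ? -1.0 : (d > 1.0 ? 1.0 : d);
      }

  In the reduced path the per-element derivative is additionally scaled by 1/N for
  the element-wise mean and by the single incoming gradOutput value, which is what
  the norm factor being assembled here does.
*/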
* THTensor_(fastGet1d)(gradOutput, 0); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if (x < -1.) + *gradInput_data = - norm; + else if (x > 1.) + *gradInput_data = norm; + else + *gradInput_data = norm * x; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SoftMarginCriterion.c b/aten/src/THNN/generic/SoftMarginCriterion.c new file mode 100644 index 0000000..8fb31f9 --- /dev/null +++ b/aten/src/THNN/generic/SoftMarginCriterion.c @@ -0,0 +1,65 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + + if (reduction == Reduction::None) { + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY3(real, input, real, target, real, output, + *output_data = log(1. + exp(-*input_data * *target_data));) + return; + } + + THTensor_(resize1d)(output, 1); + + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = log(1. + exp(-*input_data* *target_data)); + sum += z;) + + if (reduction == Reduction::ElementwiseMean) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction) +{ + THNN_CHECK_SHAPE(input, target); + THTensor_(resizeAs)(gradInput, input); + + if (!reduction) { + THNN_CHECK_SHAPE(gradOutput, input); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -*target_data * z/(1. + z);) + THTensor_(cmul)(gradInput, gradInput, gradOutput); + return; + } + + real norm = (reduction == Reduction::ElementwiseMean ? 1./((real)THTensor_(nElement)(input)) : 1.); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -norm*(*target_data)*z/(1. + z) * THTensor_(fastGet1d)(gradOutput, 0);) +} + +#endif diff --git a/aten/src/THNN/generic/SoftPlus.c b/aten/src/THNN/generic/SoftPlus.c new file mode 100644 index 0000000..6491e66 --- /dev/null +++ b/aten/src/THNN/generic/SoftPlus.c @@ -0,0 +1,47 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftPlus.c" +#else + +void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THTensor_(resizeAs)(output, input); + + // f(x) = 1/beta * log(1 + exp(beta * x)) + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data * beta) > threshold ? 
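/*
  The threshold in this ternary is a numerical guard: for large beta*x the exact
  softplus log1p(exp(beta*x))/beta would overflow exp(), but it is already equal to x
  to within rounding, so the identity is returned instead. A standalone sketch,
  assuming double precision (softplus is an illustrative name):

      #include <math.h>

      static double softplus(double x, double beta, double threshold) {
          double bx = beta * x;
          return bx > threshold ? x : log1p(exp(bx)) / beta;
      }

  The backward pass mirrors the same test: above the threshold the incoming gradient
  is passed through unchanged, otherwise it is scaled by (exp(beta*y) - 1)/exp(beta*y).
*/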
*input_data : THLog1p(exp(*input_data * beta)) / beta; + ); +} + +void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) + // SINCE + // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) + // THEREFORE: + // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = exp(*output_data * beta); + *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SoftShrink.c b/aten/src/THNN/generic/SoftShrink.c new file mode 100644 index 0000000..e779508 --- /dev/null +++ b/aten/src/THNN/generic/SoftShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftShrink.c" +#else + +void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if ((*input_data) > lambda) + *output_data = *input_data - lambda; + else if ((*input_data) < -lambda) + *output_data = *input_data + lambda; + else + *output_data = 0; + ); +} + +void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > lambda || (*input_data) < -lambda) + *gradInput_data = (*gradOutput_data); + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c new file mode 100644 index 0000000..a0c078b --- /dev/null +++ b/aten/src/THNN/generic/SparseLinear.c @@ -0,0 +1,564 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SparseLinear.c" +#else + +#ifdef _OPENMP +#include +#endif + +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) + +static bool THNN_(checkLegacyInput)(THTensor* t) +{ + return !t->is_empty() && t->dim() == 3 && t->size[2] == 2; +} + +static bool THNN_(checkInput)(THTensor* t) +{ + return!t->is_empty() && t->dim() == 2 && t->size[1] == 3; +} + +static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) +{ + return !t->is_empty() && t->dim() == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) +{ + return !t->is_empty() && t->dim() == 1 && t->size[0] == size0; +} + +static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); +} +static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); +} +static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { + return THStorage_(get)(t->storage, t->storageOffset + + 
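/*
  These small get/set helpers just do strided element lookup by hand: element
  (x0, x1) of a 2-D tensor lives at storageOffset + x0*stride[0] + x1*stride[1] in
  the flat storage. A self-contained sketch of the same indexing (illustrative
  names, float storage assumed):

      #include <stdint.h>

      static float get2d_strided(const float *storage, int64_t offset,
                                 int64_t stride0, int64_t stride1,
                                 int64_t x0, int64_t x1) {
          return storage[offset + x0 * stride0 + x1 * stride1];
      }

  They are used below to walk the sparse input, which the checks above expect in COO
  form as an nnz x 3 matrix of (row, column, value) triples; the code treats the
  indices as 1-based, subtracting 1 and reporting errors as "not between 1 and inDim".
*/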
x0*t->stride[0] + x1*t->stride[1]); +} + +void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + int64_t h, i, hp0, hp1; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + int64_t batchSize = THTensor_(size)(output, 0); + + THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + int64_t nnz = THTensor_(size)(input, 0); + + THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); + THLongTensor_zero(csr); + + weight = THTensor_(newContiguous)(weight); + +//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; i 10000) + for (h = 0; h < batchSize; h++) { + int64_t i_start = THLongTensor_get1d(csr, h); + int64_t i_end = THLongTensor_get1d(csr, h+1); + for (i = i_start; i < i_end; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THLongTensor_free(csr); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + int64_t h, i; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + weight = THTensor_(newContiguous)(weight); + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(output, batchSize, outDim); + + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(h, i) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. 
updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t h, i, col, hp0, hp1; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, + "input must be in coo format, nnz x 3"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + int64_t nnz = THTensor_(size)(input, 0); + + THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); + THLongTensor_zero(csc); + weight = THTensor_(newContiguous)(weight); + +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i = 0; i < nnz; i++) { + hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + hp1 = (i+1 == nnz) ? + inDim : + (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csc, h+1, i+1); + } + } + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) + for (col = 0; col < inDim; col++) { + int64_t i_start = THLongTensor_get1d(csc, col); + int64_t i_end = THLongTensor_get1d(csc, col+1); + for (i = i_start; i < i_end; i++) { + real val = scale * THNN_(get2d)(input, i, 2); + + h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; + int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* buf = THTensor_(new)(); + THTensor_(sum)(buf, gradOutput, 0, 1); + THTensor_(cadd)(gradBias, gradBias, scale, buf); + THTensor_(free)(buf); + THLongTensor_free(csc); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t h, i; + int64_t outDim = THTensor_(size)(weight, 0); + int64_t inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, + "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(gradOutput, batchSize, outDim); + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i) schedule(static) if (\ + batchSize * nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + for (h = 0; h < batchSize; h++) { + real val = scale * THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* gradOutput_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(gradOutput_row, gradOutput, 0, h); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); + } + THTensor_(free)(gradOutput_row); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } +} + +void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + int64_t i; + int64_t outDim = weight->size[0]; + int64_t inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 6, + "input must be in coo format, nnz x 3"); + + + int64_t nnz = THTensor_(size)(lastInput, 0); + + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(nnz); + int64_t cnt = 0; + for (i = 0; i < nnz; i++) { + real val = THNN_(get2d)(lastInput, i, 2); + if (val == 0) { + continue; + } + int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + if (cnt == 0) return; + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + int64_t offset = (int64_t)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + int64_t h, i; + int64_t outDim = weight->size[0]; + int64_t inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, + "input size must be batchsize x nnz x 2"); + + + int64_t batchSize = THTensor_(size)(lastInput, 0); + int64_t nnz = THTensor_(size)(lastInput, 1); + + // collect unique offsets of non-0 val in input + THTensor* offsets = 
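/*
  Both updateParameters variants use the same trick to visit each touched weight
  column once: collect the column offset of every non-zero input entry, sort, and
  compact out duplicates with a linear pass. A standalone sketch of that pattern on a
  plain int64_t array (the real code stores the offsets in a THTensor and sorts with
  THTensor_(sort)):

      #include <stdint.h>
      #include <stdlib.h>

      static int cmp_i64(const void *a, const void *b) {
          int64_t x = *(const int64_t *)a, y = *(const int64_t *)b;
          return (x > y) - (x < y);
      }

      // Sorts off[0..n) and removes duplicates in place; returns the new count.
      static int64_t unique_offsets(int64_t *off, int64_t n) {
          if (n == 0) return 0;
          qsort(off, (size_t)n, sizeof *off, cmp_i64);
          int64_t cnt = 1;
          for (int64_t i = 1; i < n; ++i)
              if (off[i] != off[i - 1]) off[cnt++] = off[i];
          return cnt;
      }

  The deduplicated offsets then drive one axpy per column, applying
  -learningRate * gradWeight to only the columns the sparse input actually touched.
*/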
THTensor_(newWithSize1d)(batchSize * nnz); + int64_t cnt = 0; + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(lastInput, h, i, 1); + if (val == 0 ) { + continue; + } + int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + int64_t offset = (int64_t)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + int64_t i, j; + + int64_t outDim = gradWeight->size[0]; + int64_t inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 4, + "input must be in coo format, nnz x 3"); + + THTensor_(zero)(gradBias); + + int64_t nnz = THTensor_(size)(lastInput, 0); + +#pragma omp parallel for private(i, j) schedule(static) if ( \ + nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + if (THNN_(get2d)(lastInput, i, 2) == 0 ) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + int64_t stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. 
zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } +} + +void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + int64_t h, i, j; + + int64_t outDim = gradWeight->size[0]; + int64_t inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, + "input size must be batchsize x nnz x 2"); + + THTensor_(zero)(gradBias); + + int64_t batchSize = THTensor_(size)(lastInput, 0); + int64_t nnz = THTensor_(size)(lastInput, 1); + +#pragma omp parallel for private(h, i, j) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) { + continue; + } + + int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + int64_t stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } +} + +#undef ROW_PTR2 +#undef COL_PTR2 + +#endif diff --git a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c new file mode 100644 index 0000000..c81657f --- /dev/null +++ b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c @@ -0,0 +1,266 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeH*osizeW + oh*osizeW + ow; + + /* compute local average: */ + real sum = 0; + int ih, iw; + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + ih*istrideH + iw*istrideW); + sum += val; + } + } + + /* set output to local average */ + *op = sum / kW / kH; + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimH = 1; + int dimW = 2; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideH = 0; + int64_t istrideW 
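/*
  START_IND and END_IND above are the whole of "adaptive" pooling: an input extent of
  size in is split into out bins, with bin o covering
  [floor(o*in/out), ceil((o+1)*in/out)). Neighbouring bins can overlap by one element
  when in is not a multiple of out. A small sketch plus a worked example (the helper
  name is illustrative):

      #include <math.h>

      static void adaptive_bin(int o, int out, int in, int *start, int *end) {
          *start = (int)floorf((float)(o * in) / out);
          *end   = (int)ceilf((float)((o + 1) * in) / out);
      }

  For in = 5 and out = 3 this yields bins [0,2), [1,4) and [3,5), so every input
  element is covered and the middle bin is one element wider than the others.
*/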
= 0; + + real *input_data = nullptr; + real *output_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize4d)(output, sizeB, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + } +} + +static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeW*isizeH; + real *gradOutput_p_d = gradOutput_p + d*osizeW*osizeH; + + /* calculate average */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + real grad_delta = gradOutput_p_d[oh*osizeW +ow] / kH / kW; + + int ih, iw; + for(ih = istartH; ih < iendH; ih++) + { + for(iw = istartW; iw < iendW; iw++) + { + /* update gradient */ + gradInput_p_d[ih*isizeW + iw] += grad_delta; + } + } + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + int dimD = 0; + int dimH = 1; + int dimW = 2; + int64_t sizeB = 1; + int sizeD; + int isizeH; + int isizeW; + int osizeH; + int osizeW; + real *gradInput_data; + real *gradOutput_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + sizeB = input->size[0]; + dimD++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel 
for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeH*isizeW, gradOutput_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif + +#undef START_IND +#undef END_IND diff --git a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c new file mode 100644 index 0000000..711fa73 --- /dev/null +++ b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c @@ -0,0 +1,270 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 4d tensor B x D x H x W + +static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeH*osizeW + oh*osizeW + ow; + THIndex_t *indp = ind_p + d*osizeH*osizeW + oh*osizeW + ow; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -FLT_MAX; + int ih, iw; + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + ih*istrideH + iw*istrideW); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = (ih+istartH)*isizeW + (iw+istartW); + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeW, + int osizeH) +{ + int dimW = 2; + int dimH = 1; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideD = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + int64_t istrideB = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + THIndex_t *indices_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimW++; + dimH++; + } + + /* sizes */ + sizeD = input->size[dimH-1]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimH-1]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, sizeD, osizeH, osizeW); + /* indices will contain i,j locations for each output point */ + THIndexTensor_(resize3d)(indices, sizeD, osizeH, 
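/*
  The max search in the frame kernel above uses (val > maxval) || isnan(val), so a
  NaN anywhere in a window wins and is propagated to the output together with its
  index. A standalone sketch of that comparison rule (nan_aware_argmax is an
  illustrative name, float data assumed):

      #include <float.h>
      #include <math.h>
      #include <stddef.h>

      static size_t nan_aware_argmax(const float *v, size_t n) {
          float best = -FLT_MAX;
          size_t best_i = 0;
          for (size_t i = 0; i < n; ++i)
              if (v[i] > best || isnan(v[i])) { best = v[i]; best_i = i; }
          return best_i;
      }

  The stored index is later offset by TH_INDEX_BASE, and the backward pass simply
  routes each output gradient back to that single recorded input location.
*/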
osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize4d)(output, sizeB, sizeD, osizeH, osizeW); + /* indices will contain i,j locations for each output point */ + THIndexTensor_(resize4d)(indices, sizeB, sizeD, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeH*osizeW, + indices_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW, + istrideD, + istrideH, istrideW); + } + } +} + +static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeH*isizeW; + real *gradOutput_p_d = gradOutput_p + d*osizeH*osizeW; + THIndex_t *ind_p_d = ind_p + d*osizeH*osizeW; + + /* calculate max points */ + int64_t oh, ow; + for(oh = 0; oh < osizeH; oh++) + { + for(ow = 0; ow < osizeW; ow++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_d[oh*osizeW + ow] - TH_INDEX_BASE; + + /* update gradient */ + gradInput_p_d[maxp] += gradOutput_p_d[oh*osizeW + ow]; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices) +{ + int dimW = 2; + int dimH = 1; + int64_t sizeB = 1; + int sizeD; + int isizeH; + int isizeW; + int osizeH; + int osizeW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + sizeB = input->size[0]; + dimW++; + dimH++; + } + + /* sizes */ + sizeD = input->size[dimH-1]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeH*isizeW, gradOutput_data+b*sizeD*osizeH*osizeW, + indices_data+b*sizeD*osizeH*osizeW, + sizeD, + isizeH, isizeW, + osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialAveragePooling.c b/aten/src/THNN/generic/SpatialAveragePooling.c new file mode 100644 index 0000000..2a057e4 --- /dev/null +++ b/aten/src/THNN/generic/SpatialAveragePooling.c @@ -0,0 
+1,329 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" +#else + +static inline void THNN_(SpatialAveragePooling_shapeCheck)( + THTensor *input, THTensor *gradOutput, + int kH, int kW, int dH, int dW, int padH, int padW, + bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight, outputWidth; + int64_t nOutputPlane = nInputPlane; + + if(ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + int64_t nInputPlane; // number of channels (or colors) + + int64_t k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (input->dim() == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + int64_t xx, yy; + /* For all output pixels... */ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + int64_t i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = 0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... 
*/ + int64_t hstart = yy * dH - padH; + int64_t wstart = xx * dW - padW; + int64_t hend = fminf(hstart + kH, inputHeight + padH); + int64_t wend = fminf(wstart + kW, inputWidth + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + hend = fminf(hend, inputHeight); + wend = fminf(wend, inputWidth); + + real sum = 0; + + int divide_factor; + if(count_include_pad) + divide_factor = pool_size; + else + divide_factor = (hend - hstart) * (wend - wstart); + + int64_t kx, ky; + + for(ky = hstart; ky < hend; ky++) + { + for(kx = wstart; kx < wend; kx++) + sum += ptr_input[ky*inputWidth + kx]; + } + /* Update output */ + *ptr_output++ += sum/divide_factor; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + int64_t ndim = 3; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + int64_t nInputPlane; // number of channels (or colors) + + real *gradOutput_data; + real *gradInput_data; + + int64_t k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode); + + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + ndim = 4; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (int64_t)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (int64_t)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (int64_t)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + + THTensor_(resizeAs)(gradInput, input); + + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous"); + + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + int64_t xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + + int64_t i; + for(i=0; i= 0 && cur_target < n_classes); + + real cur_weight = weights ? 
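/*
  The reduced branch of the spatial class-NLL loss accumulates, for every pixel, the
  negative log-probability of its target class scaled by an optional per-class
  weight, and tracks the total weight for the element-wise-mean division. A
  standalone sketch of that accumulation (illustrative names; the ignore_index
  handling of the real kernel is omitted):

      #include <stdint.h>

      // logp is laid out as [batch][class][pixel]; target as [batch][pixel].
      // class_w may be NULL for unweighted classes.
      static double spatial_nll(const float *logp, const int64_t *target,
                                const float *class_w, int64_t batch,
                                int64_t classes, int64_t pixels, int mean) {
          double loss = 0.0, total_w = 0.0;
          for (int64_t b = 0; b < batch; ++b)
              for (int64_t p = 0; p < pixels; ++p) {
                  int64_t t = target[b * pixels + p];
                  double cw = class_w ? class_w[t] : 1.0;
                  total_w += cw;
                  loss -= cw * (double)logp[(b * classes + t) * pixels + p];
              }
          return (mean && total_w > 0.0) ? loss / total_w : loss;
      }

  The backward pass below inverts this: gradInput at (b, target, h, w) receives
  -weight[target], divided by the same total weight when averaging, and every other
  entry stays zero.
*/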
weights_data[cur_target] : 1.0f; + total_weight_acc += cur_weight; + output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight; + } + } + *total_weight_data = total_weight_acc; + *output_data = output_acc; + + if (reduction == Reduction::ElementwiseMean && *total_weight_data) + *output_data /= *total_weight_data; + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights, + THTensor *total_weight, + int64_t ignore_index) +{ + INITIAL_CHECK; + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, + "gradInput must be contiguous"); + THNN_CHECK_SHAPE(input, gradInput); + ignore_index -= TH_INDEX_BASE; + + if (reduction == Reduction::None) { + GRADOUTPUT_SHAPE_CHECK; + + int64_t batch_size = THTensor_(size)(input, 0); + int64_t H = THTensor_(size)(input, 2); + int64_t W = THTensor_(size)(input, 3); + + int64_t b, h, w; + #pragma omp parallel for private(b, h, w) + for (b = 0; b < batch_size; b++) { + for (h = 0; h < H; h++) { + for (w = 0; w < W; w++) { + int64_t cur_target = (int64_t)THIndexTensor_(get3d)(target, b, h, w) - TH_INDEX_BASE; + if (cur_target == ignore_index) { + continue; + } + real value = -(weights ? THTensor_(fastGet1d)(weights, cur_target) : 1.0f); + real gradOutput_value = THTensor_(fastGet3d)(gradOutput, b, h, w); + THTensor_(fastSet4d)(gradInput, b, cur_target, h, w, value * gradOutput_value); + } + } + } + return; + } + + THNN_CHECK_DIM_SIZE(gradOutput, 1, 0, 1); + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) + return; + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + int64_t batch_size = THTensor_(size)(input, 0); + int64_t n_classes = THTensor_(size)(input, 1); + int64_t map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + int64_t sample_size = map_size * n_classes; + + real normalize = (reduction == Reduction::ElementwiseMean) ? *total_weight_data : 1.0f; + + int b; + #pragma omp parallel for + for (b = 0; b < batch_size; b++) { + int elem; + for (elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; + if (cur_target == ignore_index) continue; + THAssert(cur_target >= 0 && cur_target < n_classes); + + int index = b * sample_size + cur_target * map_size + elem; + gradInput_data[index] = + -(weights ? 
weights_data[cur_target] : 1.0f) / normalize * THTensor_(fastGet1d)(gradOutput, 0); + } + } + + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +#undef INITIAL_CHECK + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c new file mode 100644 index 0000000..443901a --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -0,0 +1,366 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c" +#else + +static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, + int dW, int padH, int padW, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t nInputPlane = weight->size[2] / (kH * kW); + int64_t nOutputPlane = weight->size[1]; + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane); + THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight); + THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(view_weight_local)(THTensor *_weight) +{ + THTensor *weight = THTensor_(newContiguous)(_weight); + AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), + "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); + if (weight->dim() == 6) { + int64_t s1 = weight->size[0] * weight->size[1]; + int64_t s2 = weight->size[2]; + int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage3d)(weight->storage, + weight->storageOffset, + s1, -1, s2, -1, s3, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionLocal_updateOutput_frame) + ( + THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + THTensor *output3d, *finput3d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + THTensor_(copy)(output, bias); + + output3d = THTensor_(newWithStorage3d) + (output->storage, output->storageOffset, + outputHeight * outputWidth, 1, + nOutputPlane, outputHeight * outputWidth, + 1, nOutputPlane * outputHeight * outputWidth); + + finput3d = THTensor_(newWithStorage3d) + (finput->storage, finput->storageOffset, + outputHeight * outputWidth, 1, + kW * kH * nInputPlane, outputHeight * outputWidth, + 1, kW * kH * nInputPlane * outputHeight * outputWidth); + + // weight: oH*oW x nOutputPlane x 
nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THTensor_(free)(output3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + + int64_t nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH); + int64_t nOutputPlane = THTensor_(size)(weight, 1); + + if(input->dim() == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + + +static void THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (THTensor *gradInput, THTensor *gradOutput, + THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + THTensor *gradOutput3d, *fgradInput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + THTensor_(free)(gradOutput3d); + THTensor_(free)(fgradInput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + 
THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + int64_t nInputPlane = THTensor_(size)(weight,2)/(kW*kH); + int64_t nOutputPlane = THTensor_(size)(weight,1); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput, gradOutput, tweight, + fgradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale, + int kW, int kH, int dW, int dH, int padW, int padH, + int64_t nInputPlane, int64_t inputWidth, int64_t inputHeight, + int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) +{ + + THTensor *gradOutput3d, *finput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + + THTensor_(free)(gradOutput3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale_) +{ + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + 
THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + gradWeight = THNN_(view_weight_local)(gradWeight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int64_t nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH); + int64_t nOutputPlane = THTensor_(size)(gradWeight,1); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput, gradWeight, gradBias, finput, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput_t, gradWeight, gradBias, finput_t, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(gradWeight); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c new file mode 100644 index 0000000..cdbff69 --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -0,0 +1,413 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" +#else + +static inline void THNN_(SpatialConvolutionMM_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, int weight_nullable) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t exactInputHeight = inputHeight + 2 * padH; + int64_t exactInputWidth = inputWidth + 2 * padW; + + if (exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld). " + "Kernel size: (%ld x %ld). Kernel size can't be greater than actual input size", + exactInputHeight, exactInputWidth, kH, kW); + } + + int64_t outputHeight = (exactInputHeight - kH) / dH + 1; + int64_t outputWidth = (exactInputWidth - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). 
" + "Calculated output size per channel: (%ld x %ld). Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kH * kW); + } + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { + weight = THTensor_(newContiguous)(weight); + if (weight->dim() == 4) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t nOutputPlane, + int64_t outputWidth, + int64_t outputHeight) +{ + int64_t i; + THTensor *output2d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + if (bias) { + for(i = 0; i < nOutputPlane; i++) + THVector_(fill) + (THStorage_(data)(output->storage) + output->storageOffset + output->stride[0] * i, + THTensor_(get1d)(bias, i), outputHeight*outputWidth); + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(newViewWeightMM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0); + + input = THTensor_(newContiguous)(input); + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + int64_t nInputPlane = input->size[dimf]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t nOutputPlane = weight->size[0]; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + if(input->dim() == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, 
kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, + padW, padH, + gradInput->size[0], gradInput->size[2], gradInput->size[1], + gradOutput->size[2], gradOutput->size[1]); +} + +void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(newViewWeightMM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, + tweight, fgradInput, + kW, kH, dW, dH, padW, padH); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, + tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) +{ + int64_t i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + 
gradOutput->size[1]*gradOutput->size[2], -1); + + if (gradWeight) { + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + } + + if (gradBias) { + for(i = 0; i < gradBias->size[0]; i++) + { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // can be NULL if gradWeight = NULL + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + gradWeight = THNN_(newViewWeightMM2d)(gradWeight); + } + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + } + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + if(input->dim() == 3) + { + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, + gradBias, finput, scale); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = NULL; + if (gradWeight) { + finput_t = THTensor_(newSelect)(finput, 0, t); + } + + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, + gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + if (gradWeight) { + THTensor_(free)(finput_t); + } + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (gradWeight) { + THTensor_(free)(gradWeight); + } +} + +#endif diff --git a/aten/src/THNN/generic/SpatialConvolutionMap.c b/aten/src/THNN/generic/SpatialConvolutionMap.c new file mode 100644 index 0000000..cdd74ed --- /dev/null +++ b/aten/src/THNN/generic/SpatialConvolutionMap.c @@ -0,0 +1,277 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c" +#else + +void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + int dimw = 2; + int dimh = 1; + int dimc = 0; + int64_t nbatch = 1; + + THArgCheck(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, "non-empty 3D or 4D(batch mode) tensor expected"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimc++; + dimw++; + dimh++; + } + + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size[dimw] >= kW && 
input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + + const int64_t input_w = input->size[dimw]; + const int64_t input_h = input->size[dimh]; + const int64_t output_w = (input_w - kW) / dW + 1; + const int64_t output_h = (input_h - kH) / dH + 1; + + if (input->dim() == 3) + THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); + else + THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + connTable = THTensor_(newContiguous)(connTable); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h; + int64_t j, k; + real z= bias_data[p]; + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = z; + + /* convolve all maps */ + int nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(validXCorr2Dptr)( + output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, + 1.0, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + weight_data + k*kW*kH, + kH, kW, + dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(output); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size[dimw]; + const int64_t output_h = gradOutput->size[dimh]; + const int64_t output_w = gradOutput->size[dimw]; + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + /* contiguous */ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + connTable = THTensor_(newContiguous)(connTable); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { 
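+    /* Each thread owns one input plane p. A connTable row k stores an
+       (input plane i, output plane o) pair (TH_INDEX_BASE-based); for every
+       connection with i == p, the full convolution of gradOutput plane o with
+       kernel k is accumulated into gradInput plane p, so no two threads write
+       to the same gradInput plane. */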
+ int64_t m; + for (m = 0; m < nbatch; m++) + { + int64_t k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(fullConv2Dptr)( + gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(gradInput); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size[dimw]; + const int64_t output_h = gradOutput->size[dimh]; + const int64_t output_w = gradOutput->size[dimw]; + const int64_t kH = gradWeight->size[1]; + const int64_t kW = gradWeight->size[2]; + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + + int64_t k; + /* gradients wrt bias */ +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h; + int64_t l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + } + + /* gradients wrt weight */ + const int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + int64_t m; + for (m = 0; m < nbatch; m++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*kW*kH, + scale, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w, + dH, dW + ); + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c new file mode 100644 index 0000000..de4ddbd --- /dev/null +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ 
-0,0 +1,439 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c" +#else + +static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int weight_nullable) { + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 4, 4, weight, + "non-empty 4D weight tensor (nOutputPlane, nInputPlane, kH, kW) expected, " + "but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (!THTensor_(isContiguous)(ones) || ones->dim() != 2 || + ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
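+    // (The ones buffer lets the bias term be broadcast over the spatial output
+    // with the single GEMM below: output_n = bias * ones^T, i.e. every output
+    // location of plane p starts out at bias[p].)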
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 0); + + // Params + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], + gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; 
elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + int64_t m = nInputPlane*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(gradColumns), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, 1); + + // Params + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], + gradOutput->size[1], gradOutput->size[2]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, 
padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c new file mode 100644 index 0000000..2d595b7 --- /dev/null +++ b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c @@ -0,0 +1,401 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c" +#else + +static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( + THTensor *input, THTensor *gradOutput, THIndexTensor *indices, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationH > 0 && dilationW > 0, 12, + "dilation should be greater than zero, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + int64_t nInputPlane = input->size[dimh-1]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight, outputWidth; + int64_t nOutputPlane = nInputPlane; + + if (ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts 
inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth); + } +} + +static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t nslices, + int64_t iwidth, + int64_t iheight, + int64_t owidth, + int64_t oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH + ) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + int64_t i, j; + real *ip = input_p + k*iwidth*iheight; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight); + int64_t wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth); + while(hstart < 0) + hstart += dilationH; + while(wstart < 0) + wstart += dilationW; + + /* local pointers */ + real *op = output_p + k*owidth*oheight + i*owidth + j; + THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t tcntr = 0; + int64_t x,y; + for(y = hstart; y < hend; y += dilationH) + { + for(x = wstart; x < wend; x += dilationW) + { + tcntr = y*iwidth + x; + real val = *(ip + tcntr); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + int64_t nInputPlane; + int64_t inputHeight; + int64_t inputWidth; + int64_t outputHeight; + int64_t outputWidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, NULL, NULL, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + if (ceil_mode) + { + outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int64_t)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (int64_t)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = 
(int64_t)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data, output_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data+p*nInputPlane*inputWidth*inputHeight, + output_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nInputPlane; k++) + { + real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; + real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; + THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight; + + /* calculate max points */ + int64_t i, j; + for(i = 0; i < outputHeight; i++) + { + for(j = 0; j < outputWidth; j++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE; + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; + } + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + int nInputPlane; + int inputHeight; + int inputWidth; + int outputHeight; + int outputWidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, gradOutput, indices, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + /* get contiguous gradOutput */ + gradOutput = 
THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + outputHeight = gradOutput->size[dimh]; + outputWidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data, gradOutput_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + else + { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data+p*nInputPlane*inputWidth*inputHeight, + gradOutput_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFractionalMaxPooling.c b/aten/src/THNN/generic/SpatialFractionalMaxPooling.c new file mode 100644 index 0000000..c759872 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFractionalMaxPooling.c @@ -0,0 +1,253 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c" +#else + +static int64_t* THNN_(SpatialFractionalMaxPooling_generateIntervals)( + real sample, + int64_t inputSize, + int64_t outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + int64_t* sequence = (int64_t*) THAlloc(sizeof(int64_t) * outputSize); + + int64_t i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (int64_t) ((i + sample) * alpha) - (int64_t) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + int64_t numPlanes, + int64_t inputW, int64_t inputH, + int64_t outputW, int64_t outputH, + int poolSizeW, int poolSizeH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 2 random samples, one for W and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 2; + + /* Generate interval sequence */ + int64_t* sequenceW = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputW, outputW, poolSizeW); + int64_t* sequenceH = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputH, outputH, poolSizeH); + + /* loop over output */ + int64_t h, w; + + real* inputForPlane = input + plane * inputW * inputH; + real* outputForPlane = output + plane * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + int64_t inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + int64_t inputWStart = sequenceW[w]; + + real maxVal = -THInf; + int64_t maxIndex = -1; + + int64_t h2, w2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + 
THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + + int64_t planeIndex = h2 * inputW + w2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW + w] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE; + } + } + + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + int64_t numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (numInputDims == 3 || numInputDims == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputH + poolSizeH - 1 <= inputH, 7, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 <= inputW, 6, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 3) { + /* resize output */ + THTensor_(resize3d)(output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW); + + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } else { + THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW); + + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + THTensor_(data)(randomSamples) + batch * numPlanes * 2, + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + int64_t numPlanes, + int64_t inputW, int64_t inputH, + int64_t outputW, int64_t outputH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputW * outputH; + + int64_t h, w; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + int64_t 
outputIndex = h * outputW + w; + int64_t index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + int64_t numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 3) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputW, inputH, outputW, outputH); + } else { + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + numPlanes, inputW, inputH, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFullConvolution.c b/aten/src/THNN/generic/SpatialFullConvolution.c new file mode 100644 index 0000000..b9cd9fe --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullConvolution.c @@ -0,0 +1,59 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c" +#else + +void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); + } + +void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, gradColumns, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); +} + +void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale_) +{ 
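+  /* SpatialFullConvolution is a thin wrapper around the dilated variant with
+     dilationW = dilationH = 1 (same pattern as updateOutput/updateGradInput above) */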
+THNN_(SpatialFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, columns, ones, + kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialFullConvolutionMap.c b/aten/src/THNN/generic/SpatialFullConvolutionMap.c new file mode 100644 index 0000000..a6fe507 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullConvolutionMap.c @@ -0,0 +1,223 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c" +#else + +void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + // What does this mean? + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + const int kH = (int)weight->size[1]; + const int kW = (int)weight->size[2]; + + THArgCheck(input != NULL && !input->is_empty() && input->dim() == 3, 2, "non-empty 3D tensor expected"); + THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); + + THTensor_(resize3d)( + output_, nOutputPlane, + (input->size[1] - 1) * dH + kH, + (input->size[2] - 1) * dW + kW + ); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + THTensor* output = THTensor_(newContiguous)(output_); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = output->size[1]; + const int64_t output_w = output->size[2]; + const int64_t weight_h = weight->size[1]; + const int64_t weight_w = weight->size[2]; + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h; + int64_t j; + int nweight; + int64_t k; + + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = bias_data[p]; + + /* convolve all maps */ + nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(fullConv2Dptr)( + output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(freeCopyTo)(output, output_); +} + +void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && !weight->is_empty() && weight->dim() == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", 
TH_INDEX_BASE + ); + + /* contiguous */ + THTensor* gradInput = THTensor_(newContiguous)(gradInput_); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = gradOutput->size[1]; + const int64_t output_w = gradOutput->size[2]; + const int64_t kH = weight->size[1]; + const int64_t kW = weight->size[2]; + + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { + int64_t k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(validXCorr2Dptr)( + gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(freeCopyTo)(gradInput, gradInput_); + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + /* and dims */ + const int64_t input_h = input->size[1]; + const int64_t input_w = input->size[2]; + const int64_t output_h = gradOutput->size[1]; + const int64_t output_w = gradOutput->size[2]; + const int64_t weight_h = gradWeight->size[1]; + const int64_t weight_w = gradWeight->size[2]; + + /* gradients wrt bias */ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; + int64_t l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + + /* gradients wrt weight */ + int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*weight_w*weight_h, + scale, + gradOutput_data + o*output_w*output_h, output_h, output_w, + input_data + i*input_w*input_h, input_h, input_w, + dH, dW + ); + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git 
a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c new file mode 100644 index 0000000..8c66d02 --- /dev/null +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -0,0 +1,454 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullDilatedConvolution.c" +#else + +static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + THArgCheck((adjW < dW || adjW < dilationW) && (adjH < dH || adjH < dilationH), 15, + "output padding must be smaller than either stride or dilation, but got adjH: %d adjW: %d dH: %d dW: %d dilationH: %d dilationW: %d", + adjH, adjW, dH, dW, dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, + "non-empty 2D or 4D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, + "non-empty 3D or 4D input tensor expected but got: %s"); + + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + if (outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld). " + "Calculated output size per channel: (%ld x %ld). 
Output size is too small", + inputHeight, inputWidth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 0); + + int nInputPlane = THTensor_(size)(weight,0); + int nOutputPlane = THTensor_(size)(weight,1); + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size[3]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
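+    // `ones` is later used as the k = 1 operand of the bias GEMM, which broadcasts
+    // each bias value over every output pixel of its channel.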
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t n = columns->size[1]; + int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(columns), + nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH) +{ + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 0); + + int64_t nInputPlane = THTensor_(size)(weight,0); + int64_t nOutputPlane = THTensor_(size)(weight,1); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, 
nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradColumns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = weight->size[0]; + int64_t n = gradColumns->size[1]; + int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialFullDilatedConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, adjH, adjW, 1); + + int64_t nOutputPlane; + if (gradWeight) { + nOutputPlane = THTensor_(size)(gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THTensor_(size)(gradBias, 0); + } else { + return; + } + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 3) { + // Force batch + is_batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size[2]; + int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; + int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
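+    // Here `ones` serves as the GEMV vector operand that sums gradOutput over all
+    // output pixels when accumulating gradBias below.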
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, + inputHeight, inputWidth, + kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t n = columns->size[0]; // nOutputPlane * kh * kw + int64_t m = input_n->size[0]; // nInputPlane + int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, input->size[1], inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c new file mode 100644 index 0000000..d31f3e0 --- /dev/null +++ b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c @@ -0,0 +1,250 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, + "non-empty 4D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int oheight = THTensor_(size)(grid, 1); + int owidth = THTensor_(size)(grid, 2); + + THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); + } +} + +#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ + && y < H ? THTensor_(fastGet4d)(input, n, c, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + // resize output to the same shape as input + THTensor_(resize4d)(output, N, C, H, W); + + // loop over each output pixel + int n, h, w, c; +#pragma omp parallel for private(n, h, w, c) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); + real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + THTensor_(fastSet4d)(output, n, c, h, w, out_val); + } + } + 
} + } +} + +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H) { \ + real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ + THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + THTensor_(resize4d)(gradInput, N, C, IH, IW); + THTensor_(resize4d)(gradGrid, N, H, W, 2); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, h, w; +#pragma omp parallel for private(n, h, w) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + real gix = 0; + real giy = 0; + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); + + gix -= nw_val * (iy_se - iy) * gradout; + gix += ne_val * (iy_sw - iy) * gradout; + gix -= sw_val * (iy - iy_ne) * gradout; + gix += se_val * (iy - iy_nw) * gradout; + + giy -= nw_val * (ix_se - ix) * gradout; + giy -= ne_val * (ix - ix_sw) * gradout; + giy += sw_val * (ix_ne - ix) * 
gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); + real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); + + THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); + THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); + } + } + } +} + + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/SpatialMaxPooling.c b/aten/src/THNN/generic/SpatialMaxPooling.c new file mode 100644 index 0000000..88aaa40 --- /dev/null +++ b/aten/src/THNN/generic/SpatialMaxPooling.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c" +#else + +void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c new file mode 100644 index 0000000..64179b5 --- /dev/null +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -0,0 +1,234 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c" +#else + +static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; + int has_error = 0; + THIndex_t error_index = 0; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k*owidth*oheight; + real *input_p_k = input_p + k*iwidth*iheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp<0 || maxp>=owidth*oheight){ +#pragma omp critical + { + has_error = 1; + error_index = maxp; + } + } else { + output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */ + } + } + } + } + if (has_error) { + THError("found an invalid max index %ld (output volumes are of size %dx%d)", + error_index, oheight, owidth); + } +} + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + AT_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input, but got sizes: ", input->sizes()); + THNN_CHECK_SHAPE_INDICES(input, indices); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + 
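+  /* each entry of `indices` is the flattened position, within an oheight x owidth
+     output plane, where the corresponding input value is scattered */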
+ /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + /* get contiguous input and indices */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp < 0 || maxp >= owidth * oheight) { + THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight); + } + gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_CHECK_SHAPE_INDICES(input, indices); + + /* get contiguous gradOutput and indices */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. 
oheight= %d, owidth= %d, gradOutput: %dx%d", + oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]); + } + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 3) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialReflectionPadding.c b/aten/src/THNN/generic/SpatialReflectionPadding.c new file mode 100644 index 0000000..4ccdca8 --- /dev/null +++ b/aten/src/THNN/generic/SpatialReflectionPadding.c @@ -0,0 +1,272 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c" +#else + +static void THNN_(SpatialReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* input sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + + THArgCheck(pad_t < iheight && pad_b < iheight, 6, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_t, pad_b, dimh, 
_THSizeDesc(input->size, input->dim()).str); + + /* output sizes */ + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 3) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialReplicationPadding.c b/aten/src/THNN/generic/SpatialReplicationPadding.c new file mode 100644 index 0000000..32c125d --- /dev/null +++ b/aten/src/THNN/generic/SpatialReplicationPadding.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c" +#else + +static void THNN_(SpatialReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." 
+ " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + int64_t p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, + int64_t owidth, int64_t oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + int64_t k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + int64_t i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t iheight; + int64_t iwidth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 3) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/aten/src/THNN/generic/SpatialSubSampling.c b/aten/src/THNN/generic/SpatialSubSampling.c new file mode 100644 index 0000000..8f9f95d --- /dev/null +++ b/aten/src/THNN/generic/SpatialSubSampling.c @@ -0,0 +1,299 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialSubSampling.c" +#else + +static inline void THNN_(SpatialSubSampling_shapeCheck)( + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + int kW, int kH) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input, + "3D or 4D input tensor expected but got: %s"); + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + + int nInputPlane = THTensor_(size)(weight, 0); + + int dimw = 2; + int dimh = 1; + + int64_t inputWidth; + int64_t inputHeight; + + if (input->dim() == 4) { + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + + THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); +} + +void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH) +{ + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + int64_t k; + + THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH); + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + if (input->dim() == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + int64_t xx, yy; + /* For all output pixels... 
*/ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + /* Get the good mask for (k,i) (k out, i in) */ + real the_weight = weight_data[k]; + /* Initialize to the bias */ + real z = bias_data[k]; + int64_t i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = z; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... */ + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real sum = 0; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += inputWidth; /* next input line */ + } + /* Update output */ + *ptr_output++ += the_weight*sum; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH); + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + real *weight_data; + real *gradOutput_data; + real *gradInput_data; + + int64_t k; + + if (input->dim() == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + weight_data = THTensor_(data)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real the_weight = weight_data[k]; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + int64_t xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + int64_t i; + for(i=0; i<inputWidth*inputHeight; i++) + ptr_gi[i] = 0.0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real z = *ptr_gradOutput++ * the_weight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + ptr_gradInput[kx] += z; + ptr_gradInput += inputWidth; + } + } + } + } + } + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH); + + int dimw = 2; + int dimh = 1; + int64_t nbatch = 1; + + int64_t inputWidth; + int64_t inputHeight; + int64_t outputWidth; + int64_t outputHeight; + + int nInputPlane = THTensor_(size)(gradWeight,0); + + real *gradWeight_data; + real *gradBias_data; + real *gradOutput_data; + real *input_data; + + int64_t k; + + if (input->dim() == 4) { + dimw++; + dimh++; + nbatch = input->size[0]; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + gradWeight_data = THTensor_(data)(gradWeight); + gradBias_data = THTensor_(data)(gradBias); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + int64_t p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + real sum; + int64_t xx, yy; + int64_t i; + + sum = 0; + for(i = 0; i < outputWidth*outputHeight; i++) + sum += ptr_gradOutput[i]; + gradBias_data[k] += scale*sum; + + sum = 0; + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real z = *ptr_gradOutput++; + int64_t kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; 
kx++) + sum += z * ptr_input[kx]; + ptr_input += inputWidth; + } + } + } + gradWeight_data[k] += scale*sum; + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c new file mode 100644 index 0000000..1998d3b --- /dev/null +++ b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c @@ -0,0 +1,180 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } +} + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(input); + return; + } + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. 
- h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * inputWidth] + + w1lambda * pos1[h1p * inputWidth + w1p]); + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth, + bool align_corners){ + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += h0lambda * w0lambda * pos2[0]; + pos1[w1p] += h0lambda * w1lambda * pos2[0]; + pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0]; + pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingNearest.c b/aten/src/THNN/generic/SpatialUpSamplingNearest.c new file mode 100644 index 0000000..92eaddd --- /dev/null +++ b/aten/src/THNN/generic/SpatialUpSamplingNearest.c @@ -0,0 +1,154 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 4, 2, input, + "4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } +} + + +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + const float height_scale = (float) inputHeight / (float) outputHeight; + const float width_scale = (float) inputWidth / (float) outputWidth; + + THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, + inputHeight, inputWidth, outputHeight, outputWidth); + + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, + outputWidth); + channels = channels * nbatch; + + THAssert(inputWidth > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(input); + return; + } + + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; 
c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth) +{ + THNN_(SpatialUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, + inputHeight, inputWidth, outputHeight, outputWidth); + THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *idata = THTensor_(data)(gradInput); + real *odata = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &idata[h1 * inputWidth + w1]; + const real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] = pos2[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + real* pos1 = &idata[h1 * inputWidth + w1]; + const real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputHeight * inputWidth; + pos2 += outputHeight * outputWidth; + } + } + } + + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/Sqrt.c b/aten/src/THNN/generic/Sqrt.c new file mode 100644 index 0000000..5e75c7d --- /dev/null +++ b/aten/src/THNN/generic/Sqrt.c @@ -0,0 +1,51 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sqrt.c" +#else + +void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps_) +{ + THTensor_(resizeAs)(output, input); + THTensor_(sqrt)(output, input); +} + +void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (output->_dim() == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = (*output_data == 0.0) ? 
0.0 : (0.5 * (*gradOutput_data / *output_data)); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *output_data = THTensor_(data)(output); + int64_t i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(output); i++) + { + if (output_data[i] == 0.0) + gradInput_data[i] = 0.0; + else + gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/Square.c b/aten/src/THNN/generic/Square.c new file mode 100644 index 0000000..fac8ee3 --- /dev/null +++ b/aten/src/THNN/generic/Square.c @@ -0,0 +1,59 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Square.c" +#else + +void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + + if (input->_dim() == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data) * (*input_data); + ); + } + else + { + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + int64_t i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(input); i++) + output_data[i] = input_data[i]*input_data[i]; + } +} + +void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_SHAPE(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (input->_dim() == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *input_data = THTensor_(data)(input); + int64_t i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i]; + } +} + +#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h new file mode 100644 index 0000000..455da04 --- /dev/null +++ b/aten/src/THNN/generic/THNN.h @@ -0,0 +1,1721 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THNN.h" +#else + +#include "Reduction.h" + +TH_API void THNN_(Abs_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] Abs output +TH_API void THNN_(Abs_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput); // [OUT] gradient w.r.t. input + +TH_API void THNN_(AbsCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction); +TH_API void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + int64_t reduction); + +TH_API void THNN_(BCECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction, + THTensor *weights); // [OPTIONAL] +TH_API void THNN_(BCECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + THTensor *weights); // [OPTIONAL] + +TH_API void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) +TH_API void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. input + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *output, // [OUT] a one-element tensor with loss + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *gradOutput, + THTensor *gradInput, // [OUT] gradient w.r.t. input + int64_t reduction, + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + int64_t ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(ELU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] ELU output + accreal alpha, // an ELU parameter (as in paper) + accreal scale, // scaling factor + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) +TH_API void THNN_(ELU_updateGradInput)( + THNNState *state, // library's state + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *output, // output from a forward pass + accreal alpha, // an ELU parameter (as in paper) + accreal scale); + +TH_API void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *output, // [OUT] a one-element tensor containing the loss + int64_t reduction); +TH_API void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *gradOutput, // grad output tensor + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + int64_t reduction); + +TH_API void THNN_(GatedLinear_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor, half size of input along dimension dim + int dim); // dimension for halving operation +TH_API void THNN_(GatedLinear_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t module's output + THTensor *gradInput, // [OUT] gradient w.r.t input + int dim); // dimension for halving operation + +// HardTanh clamps the values to the interval [min_val; max_val]. +TH_API void THNN_(HardTanh_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); +TH_API void THNN_(HardTanh_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); + +TH_API void THNN_(Im2Col_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Im2Col_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Col2Im_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(Col2Im_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); + +TH_API void THNN_(L1Cost_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor +TH_API void THNN_(L1Cost_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [OPTIONAL] gradient w.r.t module's output + THTensor *gradInput); // [OUT] gradient w.r.t the input + +TH_API void THNN_(LeakyReLU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // [MODIFIED] input tensor + THTensor *output, // [OUT] output tensor + accreal negval, // negative part slope + bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated) +TH_API void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [MODIFIED] gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. 
the input + accreal negval, // negative part slope + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) + +TH_API void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *hx, + THTensor *output, + THTensor *storage); +TH_API void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage); + +TH_API void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *cell, + THTensor *output, + THTensor *outputCell); +TH_API void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *cx, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx); + +TH_API void THNN_(LogSigmoid_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // output tensor + THTensor *buffer); // [BUFFER] +TH_API void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *buffer); // [BUFFER] + +TH_API void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, // [OPTIONAL] + THIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + accreal scale); + +TH_API void THNN_(LookupTable_renorm)( + THNNState *state, // library's state + THIndexTensor *idx, // vector containing row indices (modified in function) + THTensor *weight, // 2D tensor whose rows will be renormalized + accreal maxNorm, // maximum norm + accreal normType); // the norm type (e.g., normType=2, then it's 2-norm) + +TH_API void THNN_(MarginCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contain only 1s and -1s) + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage, // if true, the loss is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contin only 1s and -1s) + THTensor *gradInput, // [OUT] gradient w.r.t. 
module's input + bool sizeAverage, // if true, the gradient is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction); + +TH_API void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + int64_t reduction); +TH_API void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + int64_t reduction); +TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *isTarget, + int64_t reduction); + +TH_API void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + int64_t reduction, + int p, + THTensor* weights, // [OPTIONAL] + accreal margin); +TH_API void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction, + int p, + THTensor *weights, // [OPTIONAL] + accreal margin); + +TH_API void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight); +TH_API void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + accreal scale); + +TH_API void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer); +TH_API void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale); + +TH_API void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace, + THGenerator *generator); +TH_API void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace); + +TH_API void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor 
*output, + int64_t reduction); +TH_API void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradOutput, + THTensor *gradInput, + int64_t reduction); + +TH_API void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta, + accreal threshold); +TH_API void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta, + accreal threshold); + +TH_API void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda); +TH_API void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda); + + +TH_API void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train); +TH_API void THNN_(IndexLinear_accGradParameters)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THTensor* valuesBuffer, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THIndexTensor *keys, + int64_t keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_updateParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THIndexTensor *runningKeys, + THIndexTensor *cumSumSizes, + int64_t keysOffset, + accreal weightDecay, + accreal learningRate); + +TH_API void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); +TH_API void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); + 
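+/* Usage sketch (illustrative only; it assumes the standard TH float tensor
+   API and that the CPU library ignores its THNNState argument, so NULL may
+   be passed for it). THNN_(NAME) is expanded once per scalar type, so each
+   declaration in this header yields concrete symbols such as
+   THNN_FloatSqrt_updateOutput and THNN_DoubleSqrt_updateOutput. A minimal
+   caller of the Sqrt forward pass declared below could look like:
+
+     THFloatTensor *in  = THFloatTensor_newWithSize2d(4, 4);
+     THFloatTensor *out = THFloatTensor_new();
+     THFloatTensor_fill(in, 2.0f);
+     THNN_FloatSqrt_updateOutput(NULL, in, out, 0);  // out[i] = sqrt(in[i]); eps is unused by the CPU kernel
+     THFloatTensor_free(in);
+     THFloatTensor_free(out);
+*/
+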
+TH_API void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps); +TH_API void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold, + accreal val, + bool inplace); +TH_API void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold, + accreal val, + bool inplace); + +TH_API void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); +TH_API void THNN_(TemporalConvolution_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); +TH_API void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize); +TH_API void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); + +TH_API void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale); + +TH_API void THNN_(TemporalUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW); +TH_API void 
THNN_(TemporalUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeW, + int osizeW); + +TH_API void THNN_(TemporalUpSamplingLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, + bool align_corners); +TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeW, + int osizeW, + bool align_corners); + +TH_API void THNN_(BatchNormalization_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, // [OPTIONAL] + THTensor *bias, // [OPTIONAL] + THTensor *running_mean, // [OPTIONAL] if train + THTensor *running_var, // [OPTIONAL] if train + THTensor *save_mean, + THTensor *save_std, + bool train, + double momentum, + double eps); +TH_API void THNN_(BatchNormalization_backward)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, // [OPTIONAL] + THTensor *gradWeight, // [OPTIONAL] + THTensor *gradBias, // [OPTIONAL] + THTensor *weight, // [OPTIONAL] + THTensor *running_mean, // [OPTIONAL] if train + THTensor *running_var, // [OPTIONAL] if train + THTensor *save_mean, // [OPTIONAL] if !train + THTensor *save_std, // [OPTIONAL] if !train + bool train, + double scale, + double eps); + +TH_API void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +TH_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int64_t inputWidth, int64_t inputHeight, + int64_t outputWidth, int64_t outputHeight, + accreal scale); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeW, int osizeH); +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices); + +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeW, int osizeH); +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); +TH_API void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int kW, 
int kH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int kW, int kH, + THIndexTensor *indices); + +TH_API void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale); + +TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale); + +TH_API void THNN_(SpatialFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + int adjW, int adjH, + accreal scale); + +TH_API void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); +TH_API void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); +TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight); +TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, 
+ THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale); + +TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeH, + int osizeW); + +TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeH, + int isizeW, + int osizeH, + int osizeW); + +TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeH, + int osizeW, + bool align_corners); +TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeH, + int isizeW, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int osizeW, int outputHeight); +TH_API void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); + +TH_API void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, bool count_include_pad); +TH_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceil_mode, bool count_include_pad); + +// VolumetricConvolution is legacy and purposefully not bound by ATen +TH_API void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void 
THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, // HACK to make signature line up with backward + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices); + +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, // [OUT] volumetric convolution output + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // [OUT] internal columns buffer + THTensor *fgradInput, // [OUT] internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + accreal scale); // scaling factor + +TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *columns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale); + +TH_API void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, // [OUT] volumetric convolution output + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // [OUT] internal columns buffer + THTensor *fgradInput, // [OUT] internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH); // extra output adjustment + +TH_API void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, // extra output adjustment + accreal scale); // scaling factor + +TH_API void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode); +TH_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode); + +TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); +TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); + +TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeW, + int osizeH); +TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeT, int osizeW, int osizeH); +TH_API void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices); + +TH_API void THNN_(SpatialReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int 
pad_right, + int pad_top, int pad_bottom); + +TH_API void THNN_(FeatureLPPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal power, + int width, + int stride, + bool batchMode); + +TH_API void THNN_(FeatureLPPooling_updateGradInput)( + THNNState *state, + THTensor* gradOutput, + THTensor* input, + THTensor* output, + THTensor* gradInput, + accreal power, + int width, + int stride, + bool batchMode); + +TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right, + int pad_top, int pad_bottom, + int pad_front, int pad_back); + +TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right, + int pad_top, int pad_bottom, + int pad_front, int pad_back); + +TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeH, + int osizeW); + +TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeT, + int isizeH, + int isizeW, + int osizeT, + int osizeH, + int osizeW); + +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int isizeB, + int isizeC, + int isizeT, + int isizeH, + int isizeW, + int osizeT, + int osizeH, + int osizeW, + bool align_corners); + +TH_API void THNN_(TemporalReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_left, int pad_right); + +TH_API void THNN_(TemporalReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_left, int pad_right); + +#endif diff --git a/aten/src/THNN/generic/Tanh.c b/aten/src/THNN/generic/Tanh.c new file mode 100644 index 0000000..898656b --- /dev/null +++ b/aten/src/THNN/generic/Tanh.c @@ -0,0 +1,48 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Tanh.c" +#else + +void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(tanh)(output, input); +} + +void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + if (output->_dim() == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. 
- z*z); + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_output = THTensor_(data)(output); + int64_t i; + +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + { + real z = ptr_output[i]; + ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z); + } + } +} + +#endif diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c new file mode 100644 index 0000000..a7fdd3f --- /dev/null +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -0,0 +1,392 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalConvolution.c" +#else + +static inline void THNN_(TemporalConvolution_shapeCheck)( + THNNState *state, + THTensor *input, + int kW, + int dW, + int *inputFrameSize) { + + THArgCheck(kW > 0, 9, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 11, + "stride should be greater than zero, but got dW: %d", dW); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck(input->size[dimF] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[dimF], *inputFrameSize); + } + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. Got: %d, Expected: %d", + input->size[dimS], kW); +} + +void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize, + int outputFrameSize) +{ + THTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (input->dim() == 3) + { + dimS = 1; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, &inputFrameSize); + input = THTensor_(newContiguous)(input); + outputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->dim() == 2) + { + THTensor_(resize2d)(output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, output, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + else + { + THTensor *outputSample 
= THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + THTensor_(resize3d)(output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(outputSample, output, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int64_t nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, outputSample, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + THTensor_(free)(outputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(outputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + int64_t nInputFrame; + int64_t nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *gradInputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + gradOutputWindow = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (gradOutput->dim() == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *gradInputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(gradInputSample, gradInput, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; 
nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(gradInputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(gradInputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int64_t nInputFrame; + int64_t nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *inputWindow; + int64_t k, i; + + int dimS = 0; // sequence dimension + + if (gradOutput->dim() == 3) + { + dimS = 1; + } + + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + if (input->dim() == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutput, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + int64_t outputFrameStride = (kW-1)/dW+1; + int64_t inputFrameStride = outputFrameStride*dW; + int64_t nFrame = 
(nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +#endif diff --git a/aten/src/THNN/generic/TemporalMaxPooling.c b/aten/src/THNN/generic/TemporalMaxPooling.c new file mode 100644 index 0000000..faef305 --- /dev/null +++ b/aten/src/THNN/generic/TemporalMaxPooling.c @@ -0,0 +1,283 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c" +#else + +static inline void THNN_(TemporalMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kW, + int dW) { + int64_t niframe; + int64_t framesize; + int64_t noframe; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + int ndims = input->dim(); + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. 
Got: %d, Expected: %d", + input->size[dimS], kW); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe); + THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize) + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize); + } +} + +void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int dW) +{ + int64_t niframe; + int64_t framesize; + int64_t noframe; + + real *input_data; + real *output_data; + THIndex_t *indices_data; + + int64_t t, y; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW); + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + + /* sizes */ + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 2) + { + /* resize output */ + THTensor_(resize2d)(output, noframe, framesize); + + /* indices will contain index locations for each output point */ + THIndexTensor_(resize2d)(indices, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for(t = 0; t < noframe; t++) + { + real *ip = input_data + t*framesize*dW; + real *op = output_data + t*framesize; + THIndex_t *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + else + { + /* number of batch frames */ + int64_t nbframe = input->size[0]; + int64_t i; + + /* resize output */ + THTensor_(resize3d)(output, nbframe, noframe, framesize); + + /* indices will contain index locations for each output point */ + THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for(i = 0; i < nbframe; i++) + { + real *inputSample_data = input_data + i*niframe*framesize; + real *outputSample_data = output_data + i*noframe*framesize; + THIndex_t *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *ip = inputSample_data + t*framesize*dW; + real *op = outputSample_data + t*framesize; + THIndex_t *xp = indicesSample_data + t*framesize; + +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + } + + /* cleanup */ + THTensor_(free)(input); + +} + +void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int dW) +{ + int64_t niframe; + int noframe; + int64_t framesize; + + real 
*gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + int64_t t, y; + + THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW); + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize and zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->dim() == 3) + { + dimS = 1; + dimF = 2; + } + /* sizes */ + niframe = input->size[dimS]; + noframe = gradOutput->size[dimS]; + framesize = gradOutput->size[dimF]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + if (input->dim() == 2) + { + for(t = 0; t < noframe; t++) + { + real *gip = gradInput_data + t*framesize*dW; + real *gop = gradOutput_data + t*framesize; + THIndex_t *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = (int64_t)xp[y]; + if (maxindex != -1) + gip[maxindex*framesize+y] += gop[y]; + } + } + } + else + { + /* number of batch frames */ + int64_t nbframe = input->size[0]; + int64_t i; + + for(i = 0; i < nbframe; i++) + { + real *gradInputSample_data = gradInput_data + i*niframe*framesize; + real *gradOutputSample_data = gradOutput_data + i*noframe*framesize; + THIndex_t *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *gip = gradInputSample_data + t*framesize*dW; + real *gop = gradOutputSample_data + t*framesize; + THIndex_t *xp = indicesSample_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + int64_t maxindex = (int64_t)xp[y]; + if (maxindex != -1) + gip[maxindex*framesize+y] += gop[y]; + } + } + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalReflectionPadding.c b/aten/src/THNN/generic/TemporalReflectionPadding.c new file mode 100644 index 0000000..ea6ea9a --- /dev/null +++ b/aten/src/THNN/generic/TemporalReflectionPadding.c @@ -0,0 +1,219 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalReflectionPadding.c" +#else + +static void THNN_(TemporalReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + /* real *dest_p = output_p + k*owidth*oheight + i * owidth + j; */ + real *dest_p = output_p + k*owidth + j; + real *src_p = input_p + k*iwidth + ip_x; + *dest_p = *src_p; + } + } +} + +void THNN_(TemporalReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: 
%s"); + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* input size */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + + THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, + "Padding size should be less than the corresponding input dimension, " + "but got: padding (%d, %d) at dimension %d of input %s", + pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + + /* output size */ + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 , 2, + "input (W: %d)is too small." + " Calculated output W: %d", + iwidth, owidth); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 2) + { + THTensor_(resize2d)(output, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(TemporalReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + else + { + long p; + + THTensor_(resize3d)(output, nbatch, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(TemporalReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth, + output_data+p*nslices*owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(TemporalReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + real *src_p = goutput_p + k*owidth + j; + real *dest_p = ginput_p + k*iwidth + ip_x; + *dest_p += *src_p; + } + } +} + +void THNN_(TemporalReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 2) { + THNN_(TemporalReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, + owidth, + pad_l, pad_r); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(TemporalReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iwidth, + THTensor_(data)(gradOutput) + p * nslices * owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalReplicationPadding.c b/aten/src/THNN/generic/TemporalReplicationPadding.c new file mode 100644 index 0000000..da8aeb5 --- /dev/null +++ b/aten/src/THNN/generic/TemporalReplicationPadding.c @@ -0,0 +1,211 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalReplicationPadding.c" +#else + +static void THNN_(TemporalReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + real *dest_p = output_p + k*owidth + j; + real *src_p = input_p + k*iwidth + ip_x; + *dest_p = *src_p; + } + } +} + +void THNN_(TemporalReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 , 2, + "input (W: %d)is too small." 
+ " Calculated output W: %d", + iwidth, owidth); + + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 2) + { + THTensor_(resize2d)(output, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(TemporalReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + else + { + long p; + + THTensor_(resize3d)(output, nbatch, nslices, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(TemporalReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth, + output_data+p*nslices*owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(TemporalReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, + long owidth, + int pad_l, int pad_r) +{ + int iStartX = fmax(0, -pad_l); + int oStartX = fmax(0, pad_l); + + long k, ip_x; +#pragma omp parallel for private(k, ip_x) + for (k = 0; k < nslices; k++) + { + long j; + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + real *src_p = goutput_p + k*owidth + j; + real *dest_p = ginput_p + k*iwidth + ip_x; + *dest_p += *src_p; + } + } +} + +void THNN_(TemporalReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r) +{ + int dimw = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iwidth; + long owidth; + + if (input->dim() == 3) + { + nbatch = input->size[0]; + dimw++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iwidth = input->size[dimw]; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. 
Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 2) { + THNN_(TemporalReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, + owidth, + pad_l, pad_r); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(TemporalReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iwidth, + THTensor_(data)(gradOutput) + p * nslices * owidth, + nslices, + iwidth, + owidth, + pad_l, pad_r); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c new file mode 100644 index 0000000..db3278b --- /dev/null +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -0,0 +1,468 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c" +#else + +static inline void THNN_(TemporalRowConvolution_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int padW) { + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 3, 3, weight, + "non-empty 3D weight tensor expected, but got: %s"); + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + // we're always looking at (possibly batch) x feats x seq + int ndim = input->dim(); + int dimF = 0; + int dimS = 1; + + if (ndim == 3) { + ++dimS; + ++dimF; + } + + THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, + "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[dimS]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (nOutputFrame < 1) { + THError("Given input size: (%d x %d). " + "Calculated output size: (%d x %d). 
Output size is too small", + inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame); + } +} + +static void THNN_(unfolded_acc_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t c; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(c) + for (c = 0; c < inputFrameSize; c++) { + int64_t kw, x; + int64_t ix = 0; + + for (kw = 0; kw < kW; kw++) { + real *src = finput_data + + c * (kW * nOutputFrame) + + kw * (nOutputFrame); + real *dst = input_data + c * (nInputFrame); + + ix = (size_t)(kw); + if (dW == 1) { + real *dst_slice = dst + (size_t)(ix); + THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame); + } else { + for (x = 0; x < nOutputFrame; x++) { + real *dst_slice = dst + (size_t)(ix + x * dW); + THVector_(cadd)(dst_slice, dst_slice, + src + (size_t)(x), 1, 1); + } + } + } + } +} + +static void THNN_(unfolded_copy_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(k) + for (k = 0; k < inputFrameSize * kW; k++) { + int64_t c = k / kW; + int64_t rest = k % kW; + int64_t kw = rest % kW; + int64_t x; + int64_t ix; + real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame); + real *src = input_data + c * (nInputFrame); + + ix = (size_t)(kw); + if (dW == 1) { + memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame)); + } else { + for (x = 0; x < nOutputFrame; x++) { + memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW), + sizeof(real) * 1); + } + } + } +} + +static void THNN_(TemporalRowConvolution_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + int64_t i; + + THTensor *output3d = THTensor_(newWithStorage3d)( + output->storage, output->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + THNN_(unfolded_copy_row)(finput, input, kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(zero)(output); + + if (bias != NULL) { + for (i = 0; i < inputFrameSize; i++) + THVector_(fill) + (THStorage_(data)(output->storage) + output->storageOffset + + output->stride[0] * i, + THTensor_(get1d)(bias, i), nOutputFrame); + } + + THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput); + + THTensor_(free)(output3d); +} + +void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, // unused here but needed for Cuda + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->dim(); + + THTensor *tinput; + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + input = THTensor_(newContiguous)(tinput); + } else { + input = THTensor_(newContiguous)(input); + } + + THNN_(TemporalRowConvolution_shapeCheck)( + state, input, NULL, weight, bias, kW, dW, 
padW); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[ndim - 1]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (ndim == 2) { /* non-batch mode */ + + THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame); + THTensor_(resize2d)(output, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input, output, weight, bias, finput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + } else { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame); + THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + if (!featFirst) { // NOTE: output will NOT be contiguous in this case + THTensor_(transpose)(output, output, ndim - 1, ndim - 2); + THTensor_(free)(tinput); + } + + THTensor_(free)(input); +} + +static void THNN_(TemporalRowConvolution_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int dW, + int padW, + int64_t inputFrameSize, + int64_t nInputFrame, + int64_t nOutputFrame) { + + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + // weight: inputFrameSize x kW x 1 + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d); + // fgradInput: inputFrameSize x kW x nOutputFrame + THTensor_(free)(gradOutput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_row)(fgradInput, gradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); +} + +void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->dim(); + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, + NULL, kW, dW, padW); + + int64_t inputFrameSize = weight->size[0]; + int64_t nInputFrame = input->size[ndim - 1]; + int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + THTensor_(resizeAs)(fgradInput, finput); + THTensor_(resizeAs)(gradInput, input); + + THTensor_(zero)(fgradInput); + THTensor_(zero)(gradInput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if (ndim == 2) { + 
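/* descriptive note: non-batch case, where the (featFirst) input is a single inputFrameSize x nInputFrame matrix, so one frame update with the transposed weight (inputFrameSize x kW x 1) computes the whole gradient */ +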
THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput, gradOutput, tweight, fgradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + } else { + int64_t T = input->size[0]; + int64_t t; + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + + if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case + + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + + THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + +} + +static void THNN_(TemporalRowConvolution_accGradParameters_frame)( + THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale) { + + int64_t i; + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + 1, -1, + gradOutput->size[1], -1); + + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 1, 2); + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + // finput: inputFrameSize x nOutputFrame x kW + THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput); + // gradWeight: inputFrameSize x 1 x kW + THTensor_(free)(tfinput); + + if (gradBias != NULL) { + for (i = 0; i < gradBias->size[0]; i++) { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput3d->storage) + + gradOutput3d->storageOffset + + i * gradOutput3d->stride[0]; + for (k = 0; k < gradOutput3d->size[2]; k++) { + sum += data[k]; + } + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] + += scale * sum; + } + } + + THTensor_(free)(gradOutput3d); + +} + +void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale_) { + + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int ndim = input->dim(); + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW); + + if (ndim == 2) { + THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput, gradWeight, gradBias, finput, scale); + } else { + int64_t T = input->size[0]; + int64_t t; + + for (t = 0; t < T; t++) { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + 
THTensor_(free)(finput_t); + } + } + + if (!featFirst) { + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalSubSampling.c b/aten/src/THNN/generic/TemporalSubSampling.c new file mode 100644 index 0000000..8c90d26 --- /dev/null +++ b/aten/src/THNN/generic/TemporalSubSampling.c @@ -0,0 +1,156 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalSubSampling.c" +#else + +static inline void THNN_(TemporalSubSampling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kW, + int dW, + int *inputFrameSize) { + int nInputFrame, nOutputFrame; + + THArgCheck(kW > 0, 6, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 7, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(!input->is_empty() && input->dim() == 2, 2, input, + "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck( input->size[1] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[1], *inputFrameSize); + } + THArgCheck( input->size[0] >= kW, 2, + "input sequence smaller than kernel size. Got %d, Expected: %d", + input->size[0], kW); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), 0, nOutputFrame); + if (inputFrameSize != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), 1, *inputFrameSize); + } + } +} + +void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize) +{ + THTensor *outputFrame, *inputWindow; + int nInputFrame, nOutputFrame; + int64_t k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 4, "bias must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize); + + outputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + THTensor_(resize2d)(output, + nOutputFrame, + inputFrameSize); + + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(outputFrame, output, 0, k); + THTensor_(sum)(outputFrame, inputWindow, 0, 1); + THTensor_(cmul)(outputFrame, outputFrame, weight); + THTensor_(cadd)(outputFrame, outputFrame, 1, bias); + } + + THTensor_(free)(outputFrame); + THTensor_(free)(inputWindow); +} + +void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + + THTensor *gradOutputFrame; + THTensor *gradInputWindow, *buffer, *kwunit; + int64_t k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + + gradOutputFrame = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + kwunit = THTensor_(newWithSize1d)(kW); + + THTensor_(fill)(kwunit, 1); + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); + 
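/* descriptive note: each of the kW input frames in this window receives weight .* gradOutput[k]; the addr() call below adds exactly that, as an outer product of the all-ones vector kwunit with the scaled gradient */ +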
THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(cmul)(buffer, weight, gradOutputFrame); + THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(gradInputWindow); + THTensor_(free)(buffer); + THTensor_(free)(kwunit); +} + +void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THTensor *gradOutputFrame; + THTensor *inputWindow, *buffer; + int64_t k; + + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + gradOutputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(sum)(buffer, inputWindow, 0, 1); + THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(inputWindow); + THTensor_(free)(buffer); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingLinear.c b/aten/src/THNN/generic/TemporalUpSamplingLinear.c new file mode 100644 index 0000000..2faa9f8 --- /dev/null +++ b/aten/src/THNN/generic/TemporalUpSamplingLinear.c @@ -0,0 +1,147 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalUpSamplingLinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 3, 2, input, + "non-empty 3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputWidth = THTensor_(size)(input, 2); + + THNN_(TemporalUpSamplingLinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputWidth, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize3d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputWidth > 0 && outputWidth > 0); + // special case: just copy + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); + return; + } + const accreal 
rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[w1]; + // index w2 is interpolated by idata[w1] and (itself or idata[w1 + 1]) + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = w0lambda * pos1[0] + w1lambda * pos1[w1p]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); +} + +void THNN_(TemporalUpSamplingLinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputWidth, + int outputWidth, + bool align_corners){ + + THNN_(TemporalUpSamplingLinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputWidth, + outputWidth); + + THTensor_(resize3d)(gradInput, nbatch, channels, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += w0lambda * pos2[0]; + pos1[w1p] += w1lambda * pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingNearest.c b/aten/src/THNN/generic/TemporalUpSamplingNearest.c new file mode 100644 index 0000000..853f6ca --- /dev/null +++ b/aten/src/THNN/generic/TemporalUpSamplingNearest.c @@ -0,0 +1,130 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputWidth, int outputWidth) { + THArgCheck(inputWidth > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (W: %d) output (W: %d)", + inputWidth, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 3, 2, input, + "3D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth); + } +} + +void THNN_(TemporalUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputWidth = THTensor_(size)(input, 2); + const float scale = (float) inputWidth / (float)outputWidth; + + THNN_(TemporalUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, inputWidth, outputWidth); + + THTensor_(resize3d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputWidth); + channels = channels * nbatch; + + THAssert(inputWidth > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); + return; + } + + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal src_x = nearest_neighbor_compute_source_index(scale, w2, inputWidth); + const int w1 = src_x; + const real* pos1 = &idata[w1]; + real* pos2 = &odata[w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(input); +} + +void THNN_(TemporalUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputWidth, + int outputWidth) +{ + THNN_(TemporalUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, inputWidth, outputWidth); + THTensor_(resize3d)(gradInput, nbatch, channels, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float scale = (float) inputWidth / (float)outputWidth; + + // special case: same-size matching grids + if (inputWidth == outputWidth) { + for (int w2 = 0; w2 < outputWidth; ++w2) { + 
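/* descriptive note: with equal input and output widths, nearest-neighbour upsampling is the identity, so each output gradient is accumulated straight into the matching input position */ +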
const int w1 = w2; + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(scale, w2, inputWidth); + real* pos1 = &data1[w1]; + const real* pos2 = &data2[w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth; + pos2 += outputWidth; + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/Threshold.c b/aten/src/THNN/generic/Threshold.c new file mode 100644 index 0000000..592aa8d --- /dev/null +++ b/aten/src/THNN/generic/Threshold.c @@ -0,0 +1,63 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Threshold.c" +#else + +void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + real val = TH_CONVERT_ACCREAL_TO_REAL(val_); + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= threshold) + *input_data = val; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > threshold) ? *input_data : val; + ); + } +} + +void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if ((*input_data) <= threshold) + *gradOutput_data = 0; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > threshold) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c new file mode 100644 index 0000000..1edf8a9 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c @@ -0,0 +1,304 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAdaptiveAveragePooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +static void THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow 
< osizeW; ow++) + { + + int istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartT*istrideT + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + + /* compute local average: */ + real sum = 0; + int it, ih, iw; + for(it = 0; it < kT; it++) + { + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); + sum += val; + } + } + } + + /* set output to local average */ + *op = sum / kT / kH / kW; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int osizeT, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeT = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideT = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideT = input->stride[dimT]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + } +} + +static void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeT*isizeW*isizeH; + real *gradOutput_p_d = gradOutput_p + d*osizeT*osizeW*osizeH; + + /* calculate average */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int istartT = START_IND(ot, osizeT, isizeT); + int iendT = END_IND(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int istartH = START_IND(oh, osizeH, isizeH); + int iendH = END_IND(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int 
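
The START_IND/END_IND macros above decide which input slab feeds each cell of the adaptive pooling: output cell a along an axis of c input elements, split into b output cells, averages input[START_IND .. END_IND). A small worked example (the function names start_ind/end_ind and the sizes are hypothetical, mirroring the macros only) shows that adjacent bins may overlap and have different lengths:

#include <math.h>
#include <stdio.h>

/* Same arithmetic as the START_IND/END_IND macros above. */
static int start_ind(int a, int b, int c) { return (int)floor((float)(a * c) / b); }
static int end_ind(int a, int b, int c)   { return (int)ceil((float)((a + 1) * c) / b); }

int main(void) {
  int isize = 10, osize = 4;                      /* hypothetical sizes */
  for (int o = 0; o < osize; ++o) {
    int s = start_ind(o, osize, isize);
    int e = end_ind(o, osize, isize);
    printf("output %d averages input [%d, %d)  (k = %d)\n", o, s, e, e - s);
  }
  /* prints bins [0,3) [2,5) [5,8) [7,10): overlapping, unequal lengths */
  return 0;
}
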
istartW = START_IND(ow, osizeW, isizeW); + int iendW = END_IND(ow, osizeW, isizeW); + int kW = iendW - istartW; + + real grad_delta = gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow] / kT / kH / kW; + + int it, ih, iw; + for(it = istartT; it < iendT; it++) + { + for(ih = istartH; ih < iendH; ih++) + { + for(iw = istartW; iw < iendW; iw++) + { + /* update gradient */ + gradInput_p_d[it*isizeH*isizeW + ih*isizeW + iw] += grad_delta; + } + } + } + } + } + } + } +} + +void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD; + int64_t isizeT; + int64_t isizeH; + int64_t isizeW; + int64_t osizeT; + int64_t osizeH; + int64_t osizeW; + real *gradInput_data; + real *gradOutput_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) { + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeT = gradOutput->size[dimT]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif + +#undef START_IND +#undef END_IND diff --git a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c new file mode 100644 index 0000000..74efa76 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c @@ -0,0 +1,305 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAdaptiveMaxPooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +// 5d tensor B x D x T x H x W + +static void THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + /* loop over output */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + int64_t istartT = START_IND(ot, osizeT, isizeT); + int64_t iendT = END_IND(ot, osizeT, isizeT); + int64_t kT = iendT - istartT; + + for(oh = 0; oh < osizeH; oh++) + { + int64_t istartH = START_IND(oh, osizeH, isizeH); + int64_t iendH = END_IND(oh, osizeH, isizeH); + int64_t kH = iendH 
- istartH; + + for(ow = 0; ow < osizeW; ow++) + { + + int64_t istartW = START_IND(ow, osizeW, isizeW); + int64_t iendW = END_IND(ow, osizeW, isizeW); + int64_t kW = iendW - istartW; + + /* local pointers */ + real *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; + real *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + THIndex_t *indp = ind_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -FLT_MAX; + int64_t it, ih, iw; + for(it = 0; it < kT; it++) + { + for(ih = 0; ih < kH; ih++) + { + for(iw = 0; iw < kW; iw++) + { + real val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = (it+istartT)*isizeH*isizeW + (ih+istartH)*isizeW + (iw+istartW); + } + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int osizeT, + int osizeW, + int osizeH) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD = 0; + int64_t isizeT = 0; + int64_t isizeH = 0; + int64_t isizeW = 0; + + int64_t istrideB = 0; + int64_t istrideD = 0; + int64_t istrideT = 0; + int64_t istrideH = 0; + int64_t istrideW = 0; + + real *input_data = nullptr; + real *output_data = nullptr; + THIndex_t *indices_data = nullptr; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + istrideB = input->stride[0]; + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + /* strides */ + istrideD = input->stride[dimD]; + istrideT = input->stride[dimT]; + istrideH = input->stride[dimH]; + istrideW = input->stride[dimW]; + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); + /* indices will contain max input locations for each output point */ + THIndexTensor_(resize4d)(indices, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + else + { + int64_t b; + + THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); + /* indices will contain max input locations for each output point */ + THIndexTensor_(resize5d)(indices, sizeB, sizeD, osizeT, osizeH, osizeW); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveMaxPooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, + indices_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, + istrideH, istrideW); + } + } +} + +static void 
THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) +{ + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) + { + real *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; + real *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; + THIndex_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; + + /* calculate max points */ + int64_t ot, oh, ow; + for(ot = 0; ot < osizeT; ot++) + { + for(oh = 0; oh < osizeH; oh++) + { + for(ow = 0; ow < osizeW; ow++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_d[ot*osizeH*osizeW + oh*osizeW + ow] - TH_INDEX_BASE; + + /* update gradient */ + gradInput_p_d[maxp] += gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow]; + } + } + } + } +} + +void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices) +{ + int dimD = 0; + int dimT = 1; + int dimH = 2; + int dimW = 3; + int64_t sizeB = 1; + int64_t sizeD; + int64_t isizeT; + int64_t isizeH; + int64_t isizeW; + int64_t osizeT; + int64_t osizeH; + int64_t osizeW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) { + sizeB = input->size[0]; + dimD++; + dimT++; + dimH++; + dimW++; + } + + /* sizes */ + sizeD = input->size[dimD]; + isizeT = input->size[dimT]; + isizeH = input->size[dimH]; + isizeW = input->size[dimW]; + osizeT = gradOutput->size[dimT]; + osizeH = gradOutput->size[dimH]; + osizeW = gradOutput->size[dimW]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + else + { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < sizeB; b++) + { + THNN_(VolumetricAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, + indices_data+b*sizeD*osizeT*osizeH*osizeW, + sizeD, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricAveragePooling.c b/aten/src/THNN/generic/VolumetricAveragePooling.c new file mode 100644 index 0000000..c9dd9f7 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricAveragePooling.c @@ -0,0 +1,500 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c" +#else + +static inline void THNN_(VolumetricAveragePooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + int ndim = input->dim(); + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + 
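
The adaptive max-pooling forward above records, for every output cell, the argmax as a single flattened offset into one (T, H, W) input plane (plus TH_INDEX_BASE), and the backward pass simply scatters the gradient through that offset. A tiny round-trip sketch of the flattening; the sizes and the max position are made up for illustration:

#include <stdio.h>

int main(void) {
  long isizeT = 3, isizeH = 4, isizeW = 5;                  /* hypothetical  */
  long it = 2, ih = 1, iw = 3;                              /* argmax coords */
  long maxindex = it * isizeH * isizeW + ih * isizeW + iw;  /* flatten       */

  long t = maxindex / (isizeH * isizeW);                    /* unflatten     */
  long h = (maxindex / isizeW) % isizeH;
  long w = maxindex % isizeW;
  printf("flat=%ld -> (t=%ld, h=%ld, w=%ld)\n", maxindex, t, h, w);
  return 0;
}
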
dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + // The second argument is argNumber... here is the index of padH. + THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11, + "pad should not be greater than half of kernel size, but got " + "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d", + padT, padW, padH, kT, kW, kH); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if (ceil_mode) { + otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + else + { + otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + + if (padT || padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((otime - 1)*dT >= itime + padT) + --otime; + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + if (otime < 1 || owidth < 1 || oheight < 1) + THError("Given input size: (%dx%dx%dx%d). " + "Calculated output size: (%dx%dx%dx%d). Output size is too small", + nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool count_include_pad) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int64_t i, j, ti; + + /* local pointers. */ + real *ip = input_p + k * itime * iwidth * iheight; + real *op = output_p + k * otime * owidth * oheight; + for (i = 0; i < otime * oheight * owidth; ++i) + *(op + i) = 0; + + /* loop over output */ + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* compute pool range. 
*/ + int64_t tstart = ti * dT - padT; + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t tend = fminf(tstart + kT, itime + padT); + int64_t hend = fminf(hstart + kH, iheight + padH); + int64_t wend = fminf(wstart + kW, iwidth + padW); + int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart); + tstart = fmaxf(tstart, 0); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + tend = fmin(tend, itime); + hend = fmin(hend, iheight); + wend = fmin(wend, iwidth); + + int divide_factor; + if (count_include_pad) + divide_factor = pool_size; + else + divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart); + + /* compute local sum: */ + real sum = 0.0; + int64_t x, y, z; + + for (z = tstart; z < tend; z++) + { + for (y = hstart; y < hend; y++) + { + for (x = wstart; x < wend; x++) + { + sum += *(ip + z * iwidth * iheight + y * iwidth + x); + } + } + } + + /* set output to local max */ + *op++ += sum / divide_factor; + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + + THNN_(VolumetricAveragePooling_shapeCheck)( + state, input, NULL, kT, kW, kH, + dT, dW, dH, padT, padW, padH, ceil_mode); + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceil_mode) + { + otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + else + { + otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1; + oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + if (padT || padH || padW) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((otime - 1)*dT >= itime + padT) + --otime; + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data, output_data, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + 
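
Two details of the average-pooling frame above are easy to miss: the per-axis output length follows the (in - k + 2*pad)/stride + 1 rule (with ceil or floor depending on ceil_mode, plus a trim so the last window still starts inside the unpadded input), and divide_factor is either the full kT*kH*kW window (count_include_pad) or only the part of the window that overlaps the real input. A one-dimensional sketch of the size rule; the helper name pooled_size and the numbers in main() are hypothetical:

#include <math.h>
#include <stdio.h>

/* One axis of the output-size rule used by VolumetricAveragePooling above. */
static long pooled_size(long in, int k, int d, int pad, int ceil_mode) {
  long out = ceil_mode
      ? (long)ceil((float)(in - k + 2 * pad) / d) + 1
      : (long)floor((float)(in - k + 2 * pad) / d) + 1;
  if (pad) {
    /* last pooling window must start inside the (unpadded) input */
    if ((out - 1) * d >= in + pad)
      --out;
  }
  return out;
}

int main(void) {
  printf("%ld\n", pooled_size(7, 3, 2, 1, 1)); /* ceil((7-3+2)/2)+1 = 4       */
  printf("%ld\n", pooled_size(5, 2, 2, 1, 1)); /* 4 at first, trimmed to 3    */
  return 0;
}
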
THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data + p * istride, output_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool count_include_pad) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int64_t i, j, ti; + + /* local pointers */ + real *ip = gradInput_p + k * itime * iwidth * iheight; + real *op = gradOutput_p + k * otime * owidth * oheight; + for (i = 0; i < itime*iwidth*iheight; i++) + *(ip + i) = 0; + + /* loop over output */ + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + int64_t tstart = ti * dT - padT; + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t tend = fminf(tstart + kT, itime + padT); + int64_t hend = fminf(hstart + kH, iheight + padH); + int64_t wend = fminf(wstart + kW, iwidth + padW); + int64_t pool_size = (tend -tstart) * (hend - hstart) * (wend - wstart); + tstart = fmaxf(tstart, 0); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + tend = fminf(tend, itime); + hend = fminf(hend, iheight); + wend = fminf(wend, iwidth); + + int64_t divide_factor; + if (count_include_pad) + divide_factor = pool_size; + else + divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart); + + /* scatter gradients out to footprint: */ + real val = *op++; + + int64_t x,y,z; + for (z = tstart; z < tend; z++) + { + for (y = hstart; y < hend; y++) + { + for (x = wstart; x < wend; x++) + { + *(ip + z * iheight * iwidth + y * iwidth + x) += val / divide_factor; + } + } + } + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int padT, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *gradInput_data; + real *gradOutput_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THNN_(VolumetricAveragePooling_shapeCheck)( + state, input, gradOutput, kT, kW, kH, + dT, dW, dH, padT, padW, padH, ceil_mode); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->dim() == 4) /* non-batch mode*/ + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, nslices, + itime, iwidth, iheight, + 
otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + padT, padW, padH, + count_include_pad + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c new file mode 100644 index 0000000..d88cc60 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolution.c" +#else + +void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimt++; + dimh++; + dimw++; + } + + int64_t nOutputPlane = weight->size[0]; + int64_t kT = weight->size[2]; + int64_t kH = weight->size[3]; + int64_t kW = weight->size[4]; + int64_t inputDepth = input->size[dimt]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth - kT) / dT + 1; + int64_t outputWidth = (inputWidth - kW) / dW + 1; + int64_t outputHeight = (inputHeight - kH) / dH + 1; + THTensor *outn = THTensor_(new)(); + int64_t i, j; + if (input->dim() == 4) /* non-batch mode */ + { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + /* add bias */ + if (bias) { + for (i = 0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, output, 0, i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + } else { + THTensor_(zero)(output); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X"); + } + else /* batch mode */ + { + int64_t nBatch = input->size[0]; + THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor *inb = THTensor_(new)(); + THTensor *outb = THTensor_(new)(); + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(inb, input, 0, j); + THTensor_(select)(outb, output, 0, j); + + /* add bias */ + if (bias) { + for (i = 0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, outb, 0, i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + } else { + THTensor_(zero)(outb); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X"); + } + + THTensor_(free)(inb); + THTensor_(free)(outb); + } + THTensor_(free)(outn); +} + +void THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, 
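
The non-MM VolumetricConvolution forward above fills each output plane with its bias and then lets conv3Dmv accumulate a strided, valid-mode result on top of it, so the output extent per axis is (in - k)/stride + 1. Below is a one-dimensional analogue of that fill-then-accumulate pattern, not the library routine itself; all sizes and values are made up for illustration:

#include <stdio.h>

int main(void) {
  const int in_len = 7, k_len = 3, d = 2;          /* hypothetical sizes   */
  const double in[7] = {1, 2, 3, 4, 5, 6, 7};
  const double w[3]  = {0.5, 1.0, -0.5};
  const double bias  = 0.1;

  const int out_len = (in_len - k_len) / d + 1;    /* valid output: 3      */
  double out[3];

  for (int o = 0; o < out_len; ++o) {
    out[o] = bias;                                 /* bias first           */
    for (int k = 0; k < k_len; ++k)
      out[o] += w[k] * in[o * d + k];              /* then accumulate      */
    printf("out[%d] = %g\n", o, out[o]);
  }
  return 0;
}
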
+ THTensor *gradInput, + THTensor *weight, + THTensor *finput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + + int nOutputPlane = (int)weight->size[0]; + + THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, + gradOutput, + "non-empty 4D or 5D (batch mode) tensor expected for gradOutput, but got: %s"); + + int dimPlane = 0; + if (gradOutput->dim() == 5) + { + dimPlane++; + } + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + /* gradient to input */ + THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1); + if (gradOutput->dim() == 4) /* non-batch mode */ + { + THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C"); + } + else /* batch mode */ + { + int64_t nBatch = gradOutput->size[0]; + THTensor *ginpb = THTensor_(new)(); + THTensor *goutb = THTensor_(new)(); + int64_t j; + + THTensor_(resize5d)(gradInput, + input->size[0], input->size[1], input->size[2], input->size[3], input->size[4] + ); + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(ginpb, gradInput, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); + THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C"); + } + THTensor_(free)(ginpb); + THTensor_(free)(goutb); + } + + THTensor_(free)(tweight); +} + +void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version + + THNN_ARGCHECK(!gradWeight->is_empty() && gradWeight->dim() == 5, 4, gradWeight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for gradWeight, but got: %s"); + + int nOutputPlane = (int)gradWeight->size[0]; + if (gradBias) { + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size[0] == nOutputPlane, 5, + "gradBias tensor has wrong size" + ); + } + + int64_t k; + real *gradBias_data; + THTensor *gradOutSlice; + int dimPlane = 0; + if (gradOutput->dim() == 5) + { + dimPlane++; + } + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + if (gradOutput->dim() == 4) /* non-batch mode */ + { + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, gradOutput, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW); + } + else /* batch mode */ + { + int64_t nBatch = gradOutput->size[0]; + THTensor *inpb = THTensor_(new)(); + THTensor *goutb = 
THTensor_(new)(); + int64_t j; + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(inpb, input, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); + + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, goutb, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW); + } + THTensor_(free)(inpb); + THTensor_(free)(goutb); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c new file mode 100644 index 0000000..2fa1874 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -0,0 +1,768 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c" +#else + +#define CONV3D_OMP_THRESHOLD 20 + +static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 5), 5, weight, + "non-empty 2D or 5D weight tensor expected, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + int64_t inputDepth; + int64_t inputHeight; + int64_t inputWidth; + + int64_t exactInputDepth; + int64_t exactInputHeight; + int64_t exactInputWidth; + int64_t outputDepth; + int64_t outputHeight; + int64_t outputWidth; + + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + + exactInputDepth = inputDepth + 2*pT; + exactInputHeight = inputHeight + 2*pH; + exactInputWidth = inputWidth + 2*pW; + + if (exactInputDepth < kT || exactInputHeight < kH || exactInputWidth < kW) { + THError("Calculated padded input size per channel: (%ld x %ld x %ld). " + "Kernel size: (%ld x %ld x %ld). Kernel size can't be greater than actual input size", + exactInputDepth, exactInputHeight, exactInputWidth, kT, kH, kW); + } + + outputDepth = (exactInputDepth - kT) / dT + 1; + outputHeight = (exactInputHeight - kH) / dH + 1; + outputWidth = (exactInputWidth - kW) / dW + 1; + + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). 
Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + if (weight->dim() == 2) { + nInputPlane /= (kT * kH * kW); + } + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(newViewWeight)(THTensor *weight) +{ + weight = THTensor_(newContiguous)(weight); + if (weight->dim() == 5) { + int64_t s1 = weight->size[0]; + int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1); + THTensor_(free)(old_weight); + } + return weight; +} + + +// Kernel for fast unfold+copy +// Borrowed from Theano +// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas + +static void THNN_(unfolded_acc_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); +#ifdef _OPENMP + int inOmp = omp_in_parallel(); + #pragma omp parallel if (!inOmp) firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, nInputPlane, inputHeight, inputWidth, inputDepth) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + int64_t n = nInputPlane * inputHeight * inputWidth * inputDepth; + int64_t seg_len_tmp = n / num_threads; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == num_threads - 1)? (n-line_index_offset) : seg_len_tmp; + + int64_t w = line_index_offset % inputWidth + pW; + int64_t h_index = line_index_offset / inputWidth; + int64_t h = h_index % inputHeight + pH; + int64_t d_index = h_index / inputHeight; + int64_t d = d_index % inputDepth + pT; + int64_t c = d_index / inputDepth; +#else + int64_t line_seg_len = nInputPlane * inputHeight * inputWidth * inputDepth; + int64_t line_index_offset = 0; + int64_t w = pW; + int64_t h = pH; + int64_t d = pT; + int64_t c = 0;; +#endif + int64_t outputHW = outputHeight * outputWidth; + int64_t outputDHW = outputDepth * outputHW; + int64_t kHkW = kH*kW; + int64_t kTkHkW = kT*kHkW; + + int64_t coeff_d_col = outputHW - dT * kHkW * outputDHW; + int64_t coeff_h_col = outputWidth - dH * kW * outputDHW; + int64_t coeff_w_col = (1 - dW * outputDHW); + + int64_t count = 0; + while (count < line_seg_len) { + // compute the start and end of the output + int64_t w_col_start = (w < kW) ? 0 : (w - kW) / dW + 1; + int64_t w_col_tmp = w / dW + 1; + int64_t w_col_end = w_col_tmp < outputWidth? w_col_tmp : outputWidth; + + int64_t h_col_start = (h < kH) ? 0 : (h - kH) / dH + 1; + int64_t h_col_tmp = h / dH + 1; + int64_t h_col_end = h_col_tmp < outputHeight? 
h_col_tmp : outputHeight; + + int64_t d_col_start = (d < kT) ? 0 : (d - kT) / dT + 1; + int64_t d_col_tmp = d / dT + 1; + int64_t d_col_end = d_col_tmp < outputDepth? d_col_tmp : outputDepth; + + real val = 0; + int64_t offset = (c * kTkHkW + d * kHkW + h * kW + w) * outputDHW; + + int64_t offset_w_col_start = w_col_start * coeff_w_col; + int64_t offset_d_col_start = d_col_start * coeff_d_col; + int64_t offset_h_col_start = h_col_start * coeff_h_col; + int64_t offset_w_col = offset_w_col_start + offset; + int64_t offset_d_col; + int64_t offset_h_col; + int64_t w_col, d_col, h_col; + for (w_col = w_col_start; w_col < w_col_end; ++w_col) { + offset_d_col = offset_d_col_start + offset_w_col; + for (d_col = d_col_start; d_col < d_col_end; ++d_col) { + offset_h_col = offset_h_col_start + offset_d_col; + for (h_col = h_col_start; h_col < h_col_end; ++h_col) { + val += finput_data[offset_h_col]; + offset_h_col += coeff_h_col; + } + offset_d_col += coeff_d_col; + } + offset_w_col += coeff_w_col; + } + + input_data[line_index_offset+count] = val; + count++; + + if (count < line_seg_len) { + if (w - pW + 1 == inputWidth) { + w = pW; + if (h - pH + 1 == inputHeight) { + h = pH; + if (d - pT + 1 == inputDepth) { + d = pT; + c++; + } + else d++; + } + else h++; + } + else w++; + } + } +#ifdef _OPENMP + } +#endif +} + +/* + Modified from the version of CUDA implementation, but the loop iterations is larger than that one. + The larger loop could lower the proportion of openmp overhead. And the inner part in loop is simpler. + The naive code is below: + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + + int64_t n = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + #pragma omp parallel for firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, inputHeight, inputWidth, inputDepth) + for (int64_t idx = 0; idx < n ; ++idx) { + int64_t w_out = line_index_offset % outputWidth; + int64_t remained = line_index_offset / outputWidth; + int64_t h_out = remained % outputHeight; + remained /= outputHeight; + int64_t d_out = remained % outputDepth; + remained /= outputDepth; + int k = remained % kW; + remained /= kW; + int j = remained % kH; + remained /= kH; + int i = remained % kT; + int64_t nip = remained / kT; + + int64_t d = d_out * dT - pT + i; + int64_t h = h_out * dH - pH + j; + int64_t w = w_out * dW - pW + k; + + finput_data[idx] = (h >= 0 && w >= 0 && d >= 0 && h < inputHeight && w < inputWidth && d < inputDepth) ? + input_data[nip*inputDepth*inputWidth*inputHeight+ d*inputHeight*inputWidth + h*inputWidth + w] : 0; + } + + However, there are 6 quotient and 6 module operations which are very time-consuming. So we choose relatively + more complex but more efficient pattern. 
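
  As an extra illustration (not part of the implementation), the incremental
  pattern used here is the same trick in fewer dimensions: carry the
  coordinates like an odometer instead of recomputing them with '/' and '%'
  for every element. In two dimensions, with hypothetical sizes:

   static void walk_grid(int H, int W) {
     int h = 0, w = 0;
     for (int idx = 0; idx < H * W; ++idx) {
       // (h, w) always equals (idx / W, idx % W) here, but is maintained
       // incrementally, so no division or modulo is needed per element.
       if (++w == W) { w = 0; ++h; }
     }
   }

  The 3-D (and kernel-offset) counters below follow the same carry chain.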
+*/ +static void THNN_(unfolded_copy_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#ifdef _OPENMP + int inOmp = omp_in_parallel(); + #pragma omp parallel if (!inOmp) firstprivate(finput_data, input_data, outputWidth, outputHeight, outputDepth, kW, kH, kT, dW, dH, dT, pW, pH, pT, nInputPlane, inputHeight, inputWidth, inputDepth) + { + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + int64_t n = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + int64_t seg_len_tmp = n / num_threads; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == num_threads - 1)? (n-line_index_offset) : seg_len_tmp; + + int64_t w_out = line_index_offset % outputWidth; + int64_t remained = line_index_offset / outputWidth; + int64_t h_out = remained % outputHeight; + remained /= outputHeight; + int64_t d_out = remained % outputDepth; + remained /= outputDepth; + int k = remained % kW; + remained /= kW; + int j = remained % kH; + remained /= kH; + int i = remained % kT; + int64_t nip = remained / kT; +#else + int64_t line_seg_len = nInputPlane*kT*kH*kW*outputDepth*outputWidth*outputHeight; + int64_t line_index_offset = 0; + int64_t w_out = 0; + int64_t h_out = 0; + int64_t d_out = 0; + int i = 0; + int j = 0; + int k = 0; + int64_t nip = 0; +#endif + + int64_t count = 0; + real* dst = finput_data + line_index_offset; + int64_t inputHW = inputHeight*inputWidth; + int64_t inputDHW = inputHW*inputDepth; + + while (count < line_seg_len) { + int64_t w = w_out * dW - pW + k; + int64_t h = h_out * dH - pH + j; + int64_t d = d_out * dT - pT + i; + + + *dst = (h >= 0 && w >= 0 && d >= 0 && h < inputHeight && w < inputWidth && d < inputDepth) ? 
+ input_data[nip*inputDHW+ d*inputHW + h*inputWidth + w] : 0; + + count++; + if (count < line_seg_len) { + dst++; + w_out++; + if (w_out == outputWidth) { + w_out = 0; + h_out++; + if (h_out == outputHeight) { + h_out = 0; + d_out++; + if (d_out == outputDepth) { + d_out = 0; + k++; + if(k == kW) { + k = 0; + j++; + if(j == kH) { + j = 0; + i++; + if(i == kT) { + i = 0; + nip++; + } + } + } + } + } + } + } + + } +#ifdef _OPENMP + } +#endif +} + +static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int64_t nInputPlane, + int64_t inputDepth, + int64_t inputWidth, + int64_t inputHeight, + int64_t nOutputPlane, + int64_t outputDepth, + int64_t outputWidth, + int64_t outputHeight) +{ + int64_t i; + THTensor *output2d; + + THNN_(unfolded_copy_vol)( + finput, input, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, + inputDepth, inputWidth, inputHeight, + outputDepth, outputWidth, outputHeight + ); + + output2d = THTensor_(newWithStorage2d)( + output->storage, output->storageOffset, nOutputPlane, -1, + outputDepth*outputHeight*outputWidth, -1 + ); + + if (bias) { + for (i = 0; i < nOutputPlane; i++) + { + THVector_(fill)( + THStorage_(data)(output->storage)+output->storageOffset+output->stride[0]*i, + THTensor_(get1d)(bias, i), + outputDepth*outputHeight*outputWidth + ); + } + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, // unused + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + int64_t nInputPlane; + int64_t inputDepth; + int64_t inputHeight; + int64_t inputWidth; + int64_t nOutputPlane; + int64_t outputDepth; + int64_t outputHeight; + int64_t outputWidth; + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, NULL, weight, bias, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 0); + input = THTensor_(newContiguous)(input); + + if (input->dim() == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + nInputPlane = input->size[dimf]; + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + nOutputPlane = weight->size[0]; + outputDepth = (inputDepth + 2*pT - kT) / dT + 1; + outputHeight = (inputHeight + 2*pH - kH) / dH + 1; + outputWidth = (inputWidth + 2*pW - kW) / dW + 1; + + weight = THNN_(newViewWeight)(weight); + + if (input->dim() == 4) + { + THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input, output, weight, bias, finput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + } + else + { + int64_t T = input->size[0]; + int64_t t; + + THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth); +#ifdef _OPENMP + #pragma omp parallel for if(T > 
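
To make the unfold + GEMM ("MM") scheme above concrete: unfolded_copy_vol lays the input out as a (nInputPlane*kT*kH*kW) x (outT*outH*outW) matrix, the 5-D weight is viewed as nOutputPlane x (nInputPlane*kT*kH*kW), and a single addmm then yields the nOutputPlane x (outT*outH*outW) output block. A small shape-bookkeeping sketch with hypothetical sizes (the variable names here are only for illustration):

#include <stdio.h>

int main(void) {
  long nInputPlane = 3, nOutputPlane = 8;
  long kT = 3, kH = 3, kW = 3;
  long outT = 5, outH = 10, outW = 10;

  long k = nInputPlane * kT * kH * kW;   /* shared inner dimension          */
  long n = outT * outH * outW;           /* one column per output location  */

  printf("finput  : %ld x %ld\n", k, n);            /* unfolded input       */
  printf("weight  : %ld x %ld\n", nOutputPlane, k); /* 2-D view of weights  */
  printf("output2d: %ld x %ld\n", nOutputPlane, n); /* weight * finput      */
  return 0;
}
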
CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input_t, output_t, weight, bias, finput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_vol)( + fgradInput, gradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], + gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + ); +} + +void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, weight, NULL, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 0); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + weight = THNN_(newViewWeight)(weight); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if (input->dim() == 4) + { + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput, gradOutput, tweight, fgradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + } + else + { + int64_t T = input->size[0]; + int64_t t; + +#ifdef _OPENMP + #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput_t, gradOutput_t, tweight, fgradInput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // 
can be NULL if gradWeight = NULL + real scale) +{ + int64_t i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + if (gradWeight){ + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + } + + if (gradBias) { + for (i = 0; i < gradBias->size[0]; i++) + { + int64_t k; + real sum = 0; + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for (k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + + (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + if (gradWeight) { + gradWeight = THNN_(newViewWeight)(gradWeight); + } + + if (input->dim() == 4) // non-batch mode + { + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + } + else // batch mode + { + int64_t T = input->size[0]; + int64_t t; + +#ifdef _OPENMP + #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t) +#endif + for (t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = NULL; + if (gradWeight) { + finput_t = THTensor_(newSelect)(finput, 0, t); + } + + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + if (gradWeight) { + THTensor_(free)(finput_t); + } + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (gradWeight) { + THTensor_(free)(gradWeight); + } +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c new file mode 100644 index 0000000..66d560a --- /dev/null +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -0,0 +1,455 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c" +#else + +static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kT, int kH, int kW, int dT, int dH, int dW, + int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW, + int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && 
dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + // Params + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + int64_t inputDepth = input->size[dimd]; + int64_t inputHeight = input->size[dimh]; + int64_t inputWidth = input->size[dimw]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (weight != NULL) { + int64_t nInputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + int64_t nOutputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, NULL, weight, bias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params: + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); + if (bias) { + bias = THTensor_(newContiguous)(bias); + THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); + } + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputHeight = input->size[3]; + int64_t inputWidth = input->size[4]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + 
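For reference, the shape check and the size computation above both derive the dilated convolution's output extent per spatial dimension as (in + 2*pad - (dilation*(k-1)+1)) / stride + 1. Below is a minimal standalone sketch of that arithmetic; the helper name `conv_out_size` and the sample sizes are illustrative, not part of THNN.

```c
#include <stdint.h>
#include <stdio.h>

/* Output extent of a dilated convolution along one dimension:
 * effective kernel = dilation * (k - 1) + 1, then a plain strided convolution. */
static int64_t conv_out_size(int64_t in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main(void) {
  /* e.g. a 16x32x32 volume, 3x3x3 kernel, stride 1, padding 1, dilation 2 */
  int64_t d = conv_out_size(16, 3, 1, 1, 2);
  int64_t h = conv_out_size(32, 3, 1, 1, 2);
  int64_t w = conv_out_size(32, 3, 1, 1, 2);
  printf("output: %lld x %lld x %lld\n", (long long)d, (long long)h, (long long)w);
  /* prints 14 x 30 x 30; the shape check errors out if any extent drops below 1 */
  return 0;
}
```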
THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || + ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t n_ = outputDepth * outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = columns->size[1]; + int64_t k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, weight, NULL, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 0); + + // Params + int64_t nInputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], 
gradOutput->size[2], gradOutput->size[3]); + } + + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + int64_t m = nInputPlane*kT*kW*kH; + int64_t n = gradColumns->size[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW, 1); + + // Params + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 4) { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, 
gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + int64_t nInputPlane = input->size[1]; + int64_t nOutputPlane = gradOutput->size[1]; + int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size[4]; + int64_t inputHeight = input->size[3]; + int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kT*kW*kH; + int64_t k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + int64_t m_ = nOutputPlane; + int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c new file mode 100644 index 0000000..1641c60 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c @@ -0,0 +1,503 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c" +#else + +static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) { + int ndim = input->dim(); + int dimN = 
0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d", + dilationT, dilationH, dilationW); + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2, + "pad should be smaller than half of kernel size, but got " + "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", + kT, kW, kH, pT, pW, pH); + + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + if (otime < 1 || owidth < 1 || oheight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). 
Output size is too small", + nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *indz_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + int64_t i, j, ti; + real *ip = input_p + k * itime * iwidth * iheight; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* local pointers */ + + int64_t start_t = ti * dT - pT; + int64_t start_h = i * dH - pH; + int64_t start_w = j * dW - pW; + + int64_t end_t = fminf(start_t + (kT - 1) * dilationT + 1, itime); + int64_t end_h = fminf(start_h + (kH - 1) * dilationH + 1, iheight); + int64_t end_w = fminf(start_w + (kW - 1) * dilationW + 1, iwidth); + + while(start_t < 0) + start_t += dilationT; + while(start_h < 0) + start_h += dilationH; + while(start_w < 0) + start_w += dilationW; + + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + THIndex_t *indzp = indz_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + real maxval = -THInf; + int64_t x,y,z; + int64_t index = 0; + + for (z = start_t; z < end_t; z += dilationT) + { + for (y = start_h; y < end_h; y += dilationH) + { + for (x = start_w; x < end_w; x += dilationW) + { + index = z * iwidth * iheight + y * iwidth + x; + real val = ip[index]; + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = index; + } + } + } + } + + // store location of max + *indzp = maxindex + TH_INDEX_BASE; + + /* set output to local max */ + *op = maxval; + } + } + } + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + int64_t nslices; + int64_t itime; + int64_t iheight; + int64_t iwidth; + int64_t otime; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, NULL, NULL, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, dilationH, + ceilMode); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - 
(dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->dim() == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j uchar locations packed into float/double */ + THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j locations for each output point */ + THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data + p * istride, + output_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *indz_p, + int64_t nslices, + int64_t itime, + int64_t iwidth, + int64_t iheight, + int64_t otime, + int64_t owidth, + int64_t oheight, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + int64_t k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; + real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; + THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight; + + /* calculate max points */ + int64_t ti, i, j; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* retrieve position of max */ + int64_t index = ti * oheight * owidth + i * owidth + j; + int64_t maxp = indz_p_k[index] - TH_INDEX_BASE; + + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[index]; + } + } + } + } + } +} + +void 
THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + int nslices; + int itime; + int iheight; + int iwidth; + int otime; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, gradOutput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, dilationH, + ceilMode); + + // TODO: gradOutput shape check + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) /* non-batch mode*/ + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + int64_t p; + int64_t nBatch = input->size[0]; + + int64_t istride = nslices * itime * iwidth * iheight; + int64_t ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data + p * istride, + gradOutput_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c b/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c new file mode 100644 index 0000000..12f9925 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFractionalMaxPooling.c @@ -0,0 +1,279 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c" +#else + +static int64_t* THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + real sample, + int64_t inputSize, + int64_t outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + int64_t* sequence = (int64_t*) THAlloc(sizeof(int64_t) * outputSize); + + int64_t i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (int64_t) ((i + sample) * alpha) - (int64_t) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + int64_t numPlanes, + int64_t inputT, int64_t inputW, int64_t inputH, + int64_t outputT, int64_t outputW, int64_t outputH, + int poolSizeT, int poolSizeW, int 
poolSizeH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 3 random samples, one for T, one for W, and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 3; + + /* Generate interval sequence */ + int64_t* sequenceT = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputT, outputT, poolSizeT); + int64_t* sequenceW = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputW, outputW, poolSizeW); + int64_t* sequenceH = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[2], inputH, outputH, poolSizeH); + + /* loop over output */ + int64_t h, w, t; + + real* inputForPlane = input + plane * inputT * inputW * inputH; + real* outputForPlane = output + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + int64_t inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + int64_t inputWStart = sequenceW[w]; + + for (t = 0; t < outputT; ++t) { + int64_t inputTStart = sequenceT[t]; + + real maxVal = -THInf; + int64_t maxIndex = -1; + + int64_t h2, w2, t2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) { + THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + THAssert(t2 >= 0 && t2 < inputT); + + int64_t planeIndex = h2 * inputW * inputT + w2 * inputT + t2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE; + } + } + } + + THFree(sequenceT); + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + int64_t numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(!input->is_empty() && (numInputDims == 4 || numInputDims == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + int64_t inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 9, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 < inputW, 8, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + THArgCheck(outputT + poolSizeT - 1 < inputT, 7, + "poolSizeT (%d) too large relative to input time (%d)", + poolSizeT, inputT); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 4) { + /* resize output */ + 
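The interval generator earlier in this file picks the pooling-window starts from a single uniform sample u in [0, 1): start[i] = floor((i + u) * alpha) - floor(u * alpha) with alpha = (inputSize - poolSize) / (outputSize - 1), and the last window is pinned to inputSize - poolSize. A plain-C sketch of that formula, with a made-up helper name:

```c
#include <stdio.h>
#include <stdlib.h>

/* Pseudo-random window starts for fractional max pooling along one dimension.
 * u is a single uniform sample in [0, 1); the caller frees the result. */
static long *fractional_starts(double u, long input_size, long output_size, int pool_size) {
  double alpha = (double)(input_size - pool_size) / (double)(output_size - 1);
  long *seq = malloc(sizeof(long) * output_size);
  for (long i = 0; i < output_size - 1; ++i)
    seq[i] = (long)((i + u) * alpha) - (long)(u * alpha);
  seq[output_size - 1] = input_size - pool_size;  /* last window ends exactly at the border */
  return seq;
}

int main(void) {
  long *starts = fractional_starts(0.3, 10, 4, 2); /* 10 -> 4 with 2-wide windows */
  for (int i = 0; i < 4; ++i) printf("%ld ", starts[i]);
  printf("\n"); /* prints 0 3 6 8: starts are non-decreasing and every window stays in-bounds */
  free(starts);
  return 0;
}
```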
THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT); + + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } else { + THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT); + + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + THTensor_(data)(randomSamples) + batch * numPlanes * 3, + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + int64_t numPlanes, + int64_t inputT, int64_t inputW, int64_t inputH, + int64_t outputT, int64_t outputW, int64_t outputH) { + int64_t plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + int64_t h, w, t; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + for (t = 0; t < outputT; ++t) { + int64_t outputIndex = h * outputW * outputT + w * outputT + t; + int64_t index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputT * inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + int64_t numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + int64_t numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + int64_t numPlanes = THTensor_(size)(input, planeDim); + int64_t inputH = THTensor_(size)(input, heightDim); + int64_t inputW = THTensor_(size)(input, widthDim); + int64_t inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3, + "gradOutput time unexpected"); + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + 
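Both pooling backward kernels in this diff follow the same pattern: the forward pass stores, for every output element, the flat index of the input element that won the max, and the backward pass accumulates gradOutput into gradInput at those indices. A minimal sketch of that scatter-add, using hypothetical names and no TH tensor machinery:

```c
#include <stdio.h>

/* Scatter-add pooled gradients back to the input using the argmax indices
 * recorded during the forward pass (one flat index per output element). */
static void maxpool_backward_frame(float *grad_input, long input_numel,
                                   const float *grad_output, const long *indices,
                                   long output_numel) {
  for (long i = 0; i < input_numel; ++i) grad_input[i] = 0.f;
  for (long o = 0; o < output_numel; ++o) {
    long idx = indices[o];
    if (idx >= 0 && idx < input_numel)   /* -1 would mean "no valid element found" */
      grad_input[idx] += grad_output[o];
  }
}

int main(void) {
  float gin[6];
  float gout[3] = {1.f, 2.f, 3.f};
  long idx[3] = {0, 4, 4};            /* two outputs picked the same input element */
  maxpool_backward_frame(gin, 6, gout, idx, 3);
  for (int i = 0; i < 6; ++i) printf("%.1f ", gin[i]);  /* 1.0 0.0 0.0 0.0 5.0 0.0 */
  printf("\n");
  return 0;
}
```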
THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 4) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } else { + int64_t batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFullConvolution.c b/aten/src/THNN/generic/VolumetricFullConvolution.c new file mode 100644 index 0000000..e546584 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFullConvolution.c @@ -0,0 +1,60 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" +#else + +void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THNN_(VolumetricFullDilatedConvolution_updateOutput)( + state, input, output, weight, bias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH); +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + state, input, gradOutput, gradInput, weight, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH); +} + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, // kenerl size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + accreal scale_) +{ + THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + state, input, gradOutput, gradWeight, gradBias, finput, fgradInput, + kT, kW, kH, dT, dW, dH, pT, pW, pH, 1, 1, 1, aT, aW, aH, scale_); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c new file mode 100644 index 0000000..c7c18ea --- /dev/null +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -0,0 +1,573 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullDilatedConvolution.c" +#else + +static void THNN_(vol2col)( + const real 
*data_vol, const int64_t channels, + const int64_t depth, const int64_t height, const int64_t width, + const int64_t depth_col, const int64_t height_col, const int64_t width_col, + const int64_t kT, const int64_t kH, const int64_t kW, + const int64_t pT, const int64_t pH, const int64_t pW, + const int64_t dT, const int64_t dH, const int64_t dW, + const int64_t dilationT, const int64_t dilationH, const int64_t dilationW, + real *data_col) +{ + int64_t c, t, h, w; + int64_t channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int64_t w_offset = c % kW; + int64_t h_offset = (c / kW) % kH; + int64_t t_offset = (c / kW / kH) % kT; + int64_t c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) + { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) + { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0; + } + } + } + } +} + +static void THNN_(col2vol)( + const real* data_col, const int64_t channels, + const int64_t depth, const int64_t height, const int64_t width, + const int64_t out_depth, const int64_t out_height, const int64_t out_width, + const int64_t kT, const int64_t kH, const int64_t kW, + const int64_t pT, const int64_t pH, const int64_t pW, + const int64_t dT, const int64_t dH, const int64_t dW, + const int64_t dilationT, const int64_t dilationH, const int64_t dilationW, + real* data_vol) +{ + int64_t c, t, h, w; + memset(data_vol, 0, sizeof(real) * depth * height * width * channels); + int64_t depth_col = out_depth; + int64_t height_col = out_height; + int64_t width_col = out_width; + int64_t channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int64_t w_offset = c % kW; + int64_t h_offset = (c / kW) % kH; + int64_t t_offset = (c / kW / kH) % kT; + int64_t c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) + { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) + { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col[((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kT, int kW, int kH, int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, int weight_nullable) { + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", 
+ dilationT, dilationH, dilationW); + THArgCheck((aT < dT || aT < dilationT) + && (aW < dW || aW < dilationW) + && (aH < dH || aH < dilationH), 15, + "output padding must be smaller than either stride or dilation," + " but got aT: %d aH: %d aW: %d dT: %d dH: %d dW: %d " + "dilationT: %d dilationH: %d dilationW: %d", + aT, aH, aW, dT, dH, dW, dilationT, dilationH, dilationW); + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + if (weight != NULL) { + THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 5, 4, weight, + "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + } else if (!weight_nullable) { + THError("weight tensor is expected to be non-nullable"); + } + + int ndim = input->dim(); + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + if (weight != NULL) { + const int64_t nInputPlane = weight->size[0]; + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + } + + const int64_t inputWidth = input->size[dimw]; + const int64_t inputHeight = input->size[dimh]; + const int64_t inputDepth = input->size[dimd]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) { + THError("Given input size per channel: (%ld x %ld x %ld). " + "Calculated output size per channel: (%ld x %ld x %ld). Output size is too small", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + } + + if (gradOutput != NULL) { + if (weight != NULL) { + const int64_t nOutputPlane = weight->size[1]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } else if (bias != NULL) { + const int64_t nOutputPlane = bias->size[0]; + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + } + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricFullDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *columns = finput; + THTensor *ones = fgradInput; + + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, NULL, weight, bias, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); + + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? 
THTensor_(newContiguous)(bias) : bias; + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const int64_t n = columns->size[1]; + const int64_t k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m_ = nOutputPlane; + const int64_t n_ = outputDepth * outputHeight * outputWidth; + const int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (is_batch == 0) + { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor 
*gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *gradColumns = finput; + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, gradOutput, weight, NULL, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); + + const int64_t nInputPlane = weight->size[0]; + const int64_t nOutputPlane = weight->size[1]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradColumns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m = weight->size[0]; + const int64_t n = gradColumns->size[1]; + const int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (is_batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); 
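The full (transposed) convolution above inverts the usual size relation: each output extent is (in - 1)*stride - 2*pad + (dilation*(k-1)+1) + adjustment. A small sketch of that formula with an illustrative helper name; the example sizes deliberately invert the dilated-convolution example shown earlier.

```c
#include <stdint.h>
#include <stdio.h>

/* Output extent of a (dilated) transposed convolution along one dimension. */
static int64_t conv_transpose_out_size(int64_t in, int k, int stride, int pad,
                                       int dilation, int adj) {
  return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
}

int main(void) {
  /* 14x30x30 feature map, 3x3x3 kernel, stride 1, padding 1, dilation 2, no adjustment:
   * this recovers the 16x32x32 input of the forward dilated-convolution sketch. */
  printf("%lld x %lld x %lld\n",
         (long long)conv_transpose_out_size(14, 3, 1, 1, 2, 0),
         (long long)conv_transpose_out_size(30, 3, 1, 1, 2, 0),
         (long long)conv_transpose_out_size(30, 3, 1, 1, 2, 0));
  return 0;
}
```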
+} + +void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, // kernel size + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int dilationT, int dilationW, int dilationH, + int aT, int aW, int aH, // extra output adjustment + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor + THNN_(VolumetricFullDilatedConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, kT, kW, kH, + dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 1); + + int64_t nOutputPlane; + if (gradWeight) { + nOutputPlane = THTensor_(size)(gradWeight, 1); + } else if (gradBias) { + nOutputPlane = THTensor_(size)(gradBias, 0); + } else { + return; + } + + THTensor *columns = finput; + THTensor *ones = fgradInput; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + if (gradWeight) { + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + } + if (gradBias) { + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); + } + + int is_batch = 1; + if (input->dim() == 4) + { + // Force batch + is_batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const int64_t inputWidth = input->size[4]; + const int64_t inputHeight = input->size[3]; + const int64_t inputDepth = input->size[2]; + const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; + const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; + const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; + + // Batch size + input planes + const int64_t batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
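The ones buffer exists only so the bias gradient can be expressed as a GEMV: multiplying gradOutput (viewed as channels x spatial) by a vector of ones is simply a per-channel sum over spatial positions, scaled and accumulated into gradBias. The loop below is the equivalent plain-C reduction, with illustrative names:

```c
#include <stdio.h>

/* Bias-gradient accumulation: per-channel sum of gradOutput over all spatial
 * positions, which the kernel above expresses as a GEMV against a ones vector. */
static void acc_grad_bias(float *grad_bias, const float *grad_output,
                          long channels, long spatial, float scale) {
  for (long c = 0; c < channels; ++c) {
    float sum = 0.f;
    for (long s = 0; s < spatial; ++s)
      sum += grad_output[c * spatial + s];   /* gradOutput laid out as C x (D*H*W) */
    grad_bias[c] += scale * sum;
  }
}

int main(void) {
  float gout[2 * 3] = {1, 2, 3, 4, 5, 6};  /* 2 channels, 3 spatial positions */
  float gbias[2] = {0, 0};
  acc_grad_bias(gbias, gout, 2, 3, 1.0f);
  printf("%.1f %.1f\n", gbias[0], gbias[1]);   /* prints 6.0 15.0 */
  return 0;
}
```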
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Do Weight: + if (gradWeight) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), nOutputPlane, + outputDepth, outputHeight, outputWidth, + inputDepth, inputHeight, inputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw + const int64_t m = input_n->size[0]; // nInputPlane + const int64_t k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + } + + // Do Bias: + if (gradBias) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const int64_t m_ = nOutputPlane; + const int64_t k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (is_batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, input->size[1], inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c new file mode 100644 index 0000000..4d7ace4 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c @@ -0,0 +1,409 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int odepth = THTensor_(size)(grid, 1); + int oheight = THTensor_(size)(grid, 2); + int owidth = THTensor_(size)(grid, 3); + + THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); + } +} + +#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ + x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ + ? THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + // resize output to the same shape as input + THTensor_(resize5d)(output, N, C, D, H, W); + + // loop over each output pixel + int n, d, h, w, c; +#pragma omp parallel for private(n, d, h, w, c) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * 
(iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); + real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + + bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; + THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); + } + } + } + } + } +} + +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ + real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ + THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); + THTensor_(resize5d)(gradGrid, N, D, H, W, 3); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, d, h, w; +//#pragma omp parallel for private(n, d, h, w) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the 
corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + real gix = 0; + real giy = 0; + real giz = 0; + + // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + 
iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); + + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); + real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); + real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); + + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, 
giy_old + giy); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); + } + } + } + } +} + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/VolumetricMaxPooling.c b/aten/src/THNN/generic/VolumetricMaxPooling.c new file mode 100644 index 0000000..a3601e0 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricMaxPooling.c @@ -0,0 +1,50 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricMaxUnpooling.c b/aten/src/THNN/generic/VolumetricMaxUnpooling.c new file mode 100644 index 0000000..b8e649c --- /dev/null +++ b/aten/src/THNN/generic/VolumetricMaxUnpooling.c @@ -0,0 +1,339 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c" +#else + +static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THNN_CHECK_SHAPE_INDICES(input, indices); + + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + + int dimw = 3; + int dimh = 2; + int dimt = 1; + int dimn = 0; + + if (input->dim() == 5) + { + dimt++; + dimw++; + dimh++; + dimn++; + } + int nslices = input->size[dimn]; + + if (gradOutput != NULL) { + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", + oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw] + ); + } + + THNN_CHECK_DIM_SIZE(gradOutput, input->dim(), dimn, nslices); + } +} + +static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH) +{ + int k; + int has_error = 0; + THIndex_t error_index = 0; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k * oT * oH * oW; + real *input_p_k = input_p + k * iT * iH * iW; + THIndex_t *ind_p_k = ind_p + k * iT * iH * iW; + + int t, i, j, index; + THIndex_t maxp; + for (t = 0; t < iT; t++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + index = t * iH * iW + i * iW + j; + maxp = ind_p_k[index] - TH_INDEX_BASE; /* retrieve position of max */ + if (maxp < 0 || maxp >= oT * oW * oH) + { +#pragma omp critical + { + has_error = 1; + error_index = maxp; + } + } else { + output_p_k[maxp] = input_p_k[index]; /* update output */ + } + } + } + } + } + if (has_error) { + THError( + "found an invalid max index %ld (output volumes are of size %dx%dx%d)", + error_index, oT, oH, oW + ); + } +} + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; + int iH; + int iW; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, NULL, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + else + { + int p; + + THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iT*iW*iH, + output_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k * iT * iH * iW; + real *gradOutput_p_k = gradOutput_p + k * oT * oH * oW; + THIndex_t *ind_p_k = ind_p + k 
* iT * iH * iW; + + int t, i, j, index; + THIndex_t maxp; + for (t = 0; t < iT; t++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + index = t * iH * iW + i * iW + j; + maxp = ind_p_k[index] - TH_INDEX_BASE; /* retrieve position of max */ + if (maxp < 0 || maxp >= oT * oH * oW) + { + THError("invalid max index %ld, oT= %d, oW= %d, oH= %d", maxp, oT, oW, oH); + } + gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } + } +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; + int iH; + int iW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, gradOutput, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + // TODO: check gradOutput shape + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->dim() == 4) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data+p*nslices*iT*iW*iH, + gradOutput_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricReplicationPadding.c b/aten/src/THNN/generic/VolumetricReplicationPadding.c new file mode 100644 index 0000000..e64cb36 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricReplicationPadding.c @@ -0,0 +1,357 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c" +#else + +static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + + THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, + "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->dim() == 5) + { + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + THArgCheck(owidth >= 1 
|| oheight >= 1 || odepth >= 1, 2, + "input (D: %d H: %d, W: %d)is too small." + " Calculated output D: %d H: %d W: %d", + idepth, iheight, iwidth, odepth, oheight, owidth); + + if (gradOutput != NULL) { + THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + nslices, THTensor_(size)(gradOutput, dimslices)); + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3, + "gradOutput depth unexpected. Expected: %d, Got: %d", + odepth, THTensor_(size)(gradOutput, dimd)); + } +} + +static void THNN_(VolumetricReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, int64_t idepth, + int64_t owidth, int64_t oheight, int64_t odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + int64_t k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + int64_t i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *dest_p = output_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *src_p = input_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + real *input_data; + real *output_data; + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->dim() == 4) + { + THTensor_(resize4d)(output, nslices, odepth, oheight, owidth); + + input_data = 
THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data, output_data, nslices, iwidth, iheight, idepth, + owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront, + pback); + } + else + { + int64_t p; + + THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data + p * nslices * iwidth * iheight * idepth, + output_data + p * nslices * owidth * oheight * odepth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + int64_t nslices, + int64_t iwidth, int64_t iheight, int64_t idepth, + int64_t owidth, int64_t oheight, int64_t odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + int64_t k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + int64_t i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *src_p = goutput_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *dest_p = ginput_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + int64_t nbatch = 1; + int64_t nslices; + int64_t idepth; + int64_t iheight; + int64_t iwidth; + int64_t odepth; + int64_t oheight; + int64_t owidth; + + if (input->dim() == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->dim() == 4) { + 
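+    /* non-batch case: the 4D input is a single sample, so one frame call below
+       accumulates all of gradOutput back into gradInput over every feature plane */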
THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } else { + int64_t p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c new file mode 100644 index 0000000..b0e1a2f --- /dev/null +++ b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c @@ -0,0 +1,173 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->_dim() == 5, 2, input, + "5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth); + } +} + + +void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputDepth, + int outputHeight, + int outputWidth) +{ + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputDepth = THTensor_(size)(input, 2); + int inputHeight = THTensor_(size)(input, 3); + int inputWidth = THTensor_(size)(input, 4); + const float depth_scale = (float) inputDepth / (float) outputDepth; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + + THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, nbatch, channels, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + + THTensor_(resize5d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputDepth, + outputHeight, + outputWidth); + channels = channels * nbatch; + + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + + input = THTensor_(newContiguous)(input); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = d2; + for (int h2 = 0; h2 < outputHeight; 
++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(input); + return; + } + + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, inputDepth); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + const real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth) +{ + THNN_(VolumetricUpSamplingNearest_shapeCheck)(NULL, gradOutput, nbatch, channels, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *idata = THTensor_(data)(gradInput); + real *odata = THTensor_(data)(gradOutput); + channels = nbatch * channels; + const float depth_scale = (float) inputDepth / (float) outputDepth; + const float height_scale = (float) inputHeight / (float)outputHeight; + const float width_scale = (float) inputWidth / (float)outputWidth; + + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = d2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputDepth * inputHeight * inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + THTensor_(free)(gradOutput); + return; + } + + for (int d2 = 0; d2 < outputDepth; ++d2) { + const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, inputDepth); + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, inputHeight); + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, inputWidth); + real* pos1 = &idata[d1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &odata[d2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputDepth * inputHeight * 
inputWidth; + pos2 += outputDepth * outputHeight * outputWidth; + } + } + } + } + + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c new file mode 100644 index 0000000..e24a3e9 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c @@ -0,0 +1,219 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c" +#else + +#include "linear_upsampling.h" + +static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", + inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth); + } +} + +void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputDepth = THTensor_(size)(input, 2); + int inputHeight = THTensor_(size)(input, 3); + int inputWidth = THTensor_(size)(input, 4); + + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize5d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && + outputDepth > 0 && outputHeight > 0 && outputWidth > 0); + // special case: just copy + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int t2 = 0; t2 < outputDepth; ++t2) { + const int t1 = t2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(input); + return; + } + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = 
linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int t2 = 0; t2 < outputDepth; ++t2) { + const accreal t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < inputDepth - 1) ? 1 : 0; + const real t1lambda = t1r - t1; + const real t0lambda = (real)1. - t1lambda; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * inputWidth] + + w1lambda * pos1[h1p * inputWidth + w1p])) + + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth] + + w1lambda * pos1[t1p * inputHeight * inputWidth + + w1p]) + + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth + + h1p * inputWidth] + + w1lambda * pos1[t1p * inputHeight * inputWidth + + h1p * inputWidth + w1p])); + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth, + bool align_corners){ + + THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputDepth, inputHeight, inputWidth, + outputDepth, outputHeight, outputWidth); + + THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) { + for (int t2 = 0; t2 < outputDepth; ++t2) { + const int t1 = t2; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(gradOutput); + return; + } + const accreal rdepth = linear_upsampling_compute_scale(inputDepth, outputDepth, align_corners); + const accreal rheight = linear_upsampling_compute_scale(inputHeight, outputHeight, align_corners); + const 
accreal rwidth = linear_upsampling_compute_scale(inputWidth, outputWidth, align_corners); + for (int t2 = 0; t2 < outputDepth; ++t2) { + const accreal t1r = linear_upsampling_compute_source_index(rdepth, t2, align_corners); + const int t1 = t1r; + const int t1p = (t1 < inputDepth - 1) ? 1 : 0; + const real t1lambda = t1r - t1; + const real t0lambda = (real)1. - t1lambda; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const accreal h1r = linear_upsampling_compute_source_index(rheight, h2, align_corners); + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const accreal w1r = linear_upsampling_compute_source_index(rwidth, w2, align_corners); + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1]; + const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0]; + pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0]; + pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0]; + pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0]; + pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0]; + pos1 += inputWidth * inputHeight * inputDepth; + pos2 += outputWidth * outputHeight * outputDepth; + } + } + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/aten/src/THNN/generic/linear_upsampling.h b/aten/src/THNN/generic/linear_upsampling.h new file mode 100644 index 0000000..2873506 --- /dev/null +++ b/aten/src/THNN/generic/linear_upsampling.h @@ -0,0 +1,51 @@ +#ifndef THNN_LINEAR_UPSAMPLING_H +#define THNN_LINEAR_UPSAMPLING_H + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + + +template +static inline T linear_upsampling_compute_scale( + int inputSize, int outputSize, bool align_corners) { + /* We view each pixel as an area, idx + 0.5 as its center index. + * Here is an example formula in 1D case. + * if align_corners: center of two corner pixel areas are preserved, + * (0.5, 0.5) -> (0.5, 0.5), + * (inputSize - 0.5, 0.5) -> (outputSize - 0.5) + * scale = (inputSize - 0.5 - 0.5) / (outputSize - 0.5 - 0.5) + * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) + * if not align_corners: the whole range is scaled accordingly + * scale = inputSize / outputSize + * src_idx + 0.5 = scale * (dst_index + 0.5) + */ + if (outputSize > 1) { + return align_corners ? (T) (inputSize - 1) / (outputSize - 1) + : (T) inputSize / outputSize; + } else { + return T(0); + } +} + +template +static inline T linear_upsampling_compute_source_index( + T scale, int dst_index, bool align_corners) { + if (align_corners) { + return scale * dst_index; + } else { + T src_idx = scale * (dst_index + 0.5) - 0.5; + return src_idx < 0 ? 
T(0) : src_idx; + } +} + +static inline int nearest_neighbor_compute_source_index( + const float scale, int dst_index, int inputSize) { + const int src_index = MIN(floorf(dst_index * scale), inputSize - 1); + return src_index; +} + + +#endif + diff --git a/aten/src/THNN/generic/unfold.c b/aten/src/THNN/generic/unfold.c new file mode 100644 index 0000000..7feae7c --- /dev/null +++ b/aten/src/THNN/generic/unfold.c @@ -0,0 +1,166 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/unfold.c" +#else + +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ +void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) +{ + // This function assumes that + // outputHeight*dH does not overflow a int64_t + // outputWidth*dW does not overflow a int64_t + + int nip; + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + int kw, kh, y, x; + int64_t ix, iy; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth); + real *dst = input_data + nip*((size_t)inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + int lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH - padH + kh; + if (iy < 0 || iy >= inputHeight) { + } else { + if (dW==1){ + ix = 0 - padW + kw; + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ + } + else{ + for (x=0; x= inputWidth){ + }else{ + real *dst_slice = dst+(size_t)iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); + } + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH + kh; + ix = 0 + kw; + if (dW == 1 ) { + real *dst_slice = dst+(size_t)iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ + }else{ + for(x = 0; x < outputWidth; x++) { + real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW; + THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); + } + } + } + } + } + } + } +} + +void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) +{ + // This function assumes that + // kH*kW does not overflow an int + // nInputPlane*kH*kW does not overflow a int64_t + // outputHeight*dH does not overflow a int64_t + // outputWidth*dW does not overflow a int64_t + + int64_t k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(k) + for(k = 0; k < (int64_t)nInputPlane*kH*kW; k++) { + int64_t nip = k / (kH*kW); + int64_t rest = k % (kH*kW); + int64_t kh = rest / kW; + int64_t kw = rest % kW; + int x, y; + int64_t ix, iy; + real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) 
+ kw*((size_t)outputHeight*outputWidth); + real *src = input_data + nip*((size_t)inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + int64_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH - padH + kh; + if (iy < 0 || iy >= inputHeight) { + memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (dW==1){ + ix = 0 - padW + kw; + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + if (outputWidth-rpad-lpad <= 0) { + memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad); + memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad)); + if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); + } + } + else{ + for (x=0; x= inputWidth) + memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1); + else + memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1)); + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (int64_t)y*dH + kh; + ix = 0 + kw; + if (dW == 1) + memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth); + else{ + for (x=0; x + +#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) +#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME) + +#define THNN_CHECK_SHAPE(I1, I2) \ + if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THTensor_(sizeDesc)(I2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } + +#define THNN_CHECK_SHAPE_INDICES(I1, I2) \ + THLongStorage *size2 = THLongTensor_newSizeOf(I2); \ + if (I1 != NULL && I2 != NULL && !THTensor_(isSize)(I1, size2)) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THLongTensor_sizeDesc(I2); \ + THLongStorage_free(size2); \ + THError(#I1 " and " #I2 " shapes do not match: " \ + #I1 " %s, " #I2 " %s", s1.str, s2.str); \ + } else { \ + THLongStorage_free(size2); \ + } + +#define THNN_CHECK_NELEMENT(I1, I2) \ + if (I1 != NULL && I2 != NULL ) { \ + ptrdiff_t n1 = THTensor_(nElement)(I1); \ + ptrdiff_t n2 = THTensor_(nElement)(I2); \ + if (n1 != n2) \ + { \ + THDescBuff s1 = THTensor_(sizeDesc)(I1); \ + THDescBuff s2 = THTensor_(sizeDesc)(I2); \ + THError(#I1 " and " #I2 " have different number of elements: " \ + #I1 "%s has %ld elements, while " \ + #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ + } \ + } + +#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ + if (THTensor_(nDimension)(T) != DIM || \ + THTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THDescBuff s1 = THTensor_(sizeDesc)(T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ + if (THIndexTensor_(nDimension)(T) != DIM || \ + THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ + THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ + " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ + } + +#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \ + if (!(COND)) { \ + THDescBuff s1 = THTensor_(sizeDesc)(T); \ + THArgCheck(COND, ARG, FORMAT, s1.str); \ + } + +#include "generic/Abs.c" +#include "THGenerateFloatTypes.h" + +#include "generic/AbsCriterion.c" +#include "THGenerateFloatTypes.h" + 
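+// Note: each "generic/*.c" file in this list is re-included once per scalar type
+// pulled in by THGenerateFloatTypes.h, so every THNN_(Name) function is stamped
+// out for float and double; THNN_(NAME) concatenates the Real type into the
+// symbol (e.g. THNN_FloatAbs_updateOutput / THNN_DoubleAbs_updateOutput), the
+// same TH_CONCAT_3 pattern used by the torch_/nn_ macros above.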
+#include "generic/BCECriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/ClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Col2Im.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/DistKLDivCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/ELU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/HardTanh.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Im2Col.c" +#include "THGenerateFloatTypes.h" + +#include "generic/GatedLinearUnit.c" +#include "THGenerateFloatTypes.h" + +#include "generic/L1Cost.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LeakyReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/FusedRNNKernel.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LogSigmoid.c" +#include "THGenerateFloatTypes.h" + +#include "generic/LookupTable.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MSECriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiLabelMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Linear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/PReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/RReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sigmoid.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SmoothL1Criterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftPlus.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftShrink.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SparseLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/IndexLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sqrt.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Square.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Tanh.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Threshold.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalRowConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalUpSamplingLinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/FeatureLPPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/BatchNormalization.c" +#include "THGenerateFloatTypes.h" + +#include "generic/unfold.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionMap.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionMM.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionLocal.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullConvolutionMap.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialDilatedConvolution.c" +#include 
"THGenerateFloatTypes.h" + +#include "generic/SpatialAdaptiveMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAdaptiveAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFractionalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialDilatedMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialMaxUnpooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialUpSamplingBilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolutionMM.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFullDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFullConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAdaptiveMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAdaptiveAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricDilatedMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFractionalMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxUnpooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialReflectionPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalReflectionPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricReplicationPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricUpSamplingTrilinear.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh new file mode 100755 index 0000000..c341b88 --- /dev/null +++ b/aten/tools/run_tests.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -x +set -e + +VALGRIND_SUP="${PWD}/`dirname $0`/valgrind.sup" +pushd $1 + +VALGRIND=${VALGRIND:=ON} +./basic +./atest +./scalar_test +./broadcast_test +./wrapdim_test +./apply_utils_test +./dlconvertor_test +./native_test +./scalar_tensor_test +./undefined_tensor_test +if [[ -x ./cudnn_test ]]; then + ./cudnn_test +fi +if [[ -x ./cuda_rng_test ]]; then + ./cuda_rng_test +fi +if [[ -x ./apply_test ]]; then + ./apply_test +fi +if [[ -x ./stream_test ]]; then + ./stream_test +fi +if [ "$VALGRIND" == "ON" ] +then + valgrind --suppressions="$VALGRIND_SUP" --error-exitcode=1 ./basic "[cpu]" +fi + +popd diff --git a/aten/tools/test_install.sh b/aten/tools/test_install.sh new file mode 100755 index 
0000000..381d1e7 --- /dev/null +++ b/aten/tools/test_install.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -x +set -e +INSTALL_ROOT=$1 +SRC_ROOT=$2 +rm -rf test_build +mkdir test_build +cd test_build +cmake -DCMAKE_PREFIX_PATH=$INSTALL_ROOT $SRC_ROOT/src/ATen/test/test_install +make +./main diff --git a/aten/tools/update_doc.sh b/aten/tools/update_doc.sh new file mode 100755 index 0000000..f8fb6c3 --- /dev/null +++ b/aten/tools/update_doc.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cp build/src/ATen/ATen/{Tensor,Type,Functions}.h doc + diff --git a/aten/tools/valgrind.sup b/aten/tools/valgrind.sup new file mode 100644 index 0000000..fd1c39d --- /dev/null +++ b/aten/tools/valgrind.sup @@ -0,0 +1,11 @@ +{ + + Memcheck:Cond + fun:index + fun:expand_dynamic_string_token + fun:_dl_map_object + fun:map_doit + fun:_dl_catch_error + fun:handle_ld_preload + ... +} diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt new file mode 100644 index 0000000..2f37470 --- /dev/null +++ b/binaries/CMakeLists.txt @@ -0,0 +1,56 @@ +caffe2_binary_target("convert_caffe_image_db.cc") +caffe2_binary_target("convert_db.cc") +caffe2_binary_target("make_cifar_db.cc") +caffe2_binary_target("make_mnist_db.cc") +caffe2_binary_target("predictor_verifier.cc") +caffe2_binary_target("print_registered_core_operators.cc") +caffe2_binary_target("run_plan.cc") +caffe2_binary_target("speed_benchmark.cc") +caffe2_binary_target("split_db.cc") + +caffe2_binary_target("db_throughput.cc") + + +if (USE_CUDA) + caffe2_binary_target("inspect_gpus.cc") + target_link_libraries(inspect_gpus ${CUDA_LIBRARIES}) + caffe2_binary_target("print_core_object_sizes.cc") + + if (BUILD_TEST) + # Core overhead benchmark + caffe2_binary_target("core_overhead_benchmark.cc") + target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY}) + endif() +endif() + +if (USE_ZMQ) + caffe2_binary_target("zmq_feeder.cc") + target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES}) +endif() + +if(USE_MPI) + caffe2_binary_target("run_plan_mpi.cc") + target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES}) +endif() + +if (USE_OPENCV AND USE_LEVELDB) + caffe2_binary_target("convert_encoded_to_raw_leveldb.cc") + target_link_libraries( + convert_encoded_to_raw_leveldb + ${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES}) +endif() + +if (USE_OPENCV) + caffe2_binary_target("make_image_db.cc") + target_link_libraries(make_image_db ${OpenCV_LIBS}) +endif() + +if (USE_OBSERVERS) + add_executable(caffe2_benchmark "caffe2_benchmark.cc" "benchmark_helper.cc") + target_link_libraries(caffe2_benchmark ${Caffe2_MAIN_LIBS}) + target_link_libraries(caffe2_benchmark ${Caffe2_MODULES}) + install(TARGETS caffe2_benchmark DESTINATION bin) +endif() + +# ---[ tutorials +caffe2_binary_target("tutorial_blob.cc") diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py new file mode 100644 index 0000000..048e151 --- /dev/null +++ b/binaries/bench_gen/bench_gen.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse + +from caffe2.python.model_helper import ModelHelper +from caffe2.python.predictor import mobile_exporter +from caffe2.python import workspace, brew + + +def parse_kwarg(kwarg_str): + key, value = kwarg_str.split('=') + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + pass + return key, value + + +def main(args): + # User defined keyword 
arguments + kwargs = {"order": "NCHW"} + kwargs.update(dict(args.kwargs)) + + model = ModelHelper(name=args.benchmark_name) + + op_type = args.operator # assumes a brew type op name + input_name = args.input_name + output_name = args.output_name + + iters = int(args.instances) + for i in range(iters): + input_blob_name = input_name + (str(i) if i > 0 and args.chain else '') + output_blob_name = output_name + str(i + 1) + add_op = getattr(brew, op_type) + add_op(model, input_blob_name, output_blob_name, **kwargs) + if args.chain: + input_name, output_name = output_name, input_name + + workspace.RunNetOnce(model.param_init_net) + + init_net, predict_net = mobile_exporter.Export( + workspace, model.net, model.params + ) + + if args.debug: + print("init_net:") + for op in init_net.op: + print(" ", op.type, op.input, "-->", op.output) + print("predict_net:") + for op in predict_net.op: + print(" ", op.type, op.input, "-->", op.output) + + with open(args.predict_net, 'wb') as f: + f.write(predict_net.SerializeToString()) + with open(args.init_net, 'wb') as f: + f.write(init_net.SerializeToString()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Utilitity to generate Caffe2 benchmark models.") + parser.add_argument("operator", help="Caffe2 operator to benchmark.") + parser.add_argument("-b", "--blob", + help="Instantiate a blob --blob name=dim1,dim2,dim3", + action='append') + parser.add_argument("--context", help="Context to run on.", default="CPU") + parser.add_argument("--kwargs", help="kwargs to pass to operator.", + nargs="*", type=parse_kwarg, default=[]) + parser.add_argument("--init_net", help="Output initialization net.", + default="init_net.pb") + parser.add_argument("--predict_net", help="Output prediction net.", + default="predict_net.pb") + parser.add_argument("--benchmark_name", + help="Name of the benchmark network", + default="benchmark") + parser.add_argument("--input_name", help="Name of the input blob.", + default="data") + parser.add_argument("--output_name", help="Name of the output blob.", + default="output") + parser.add_argument("--instances", + help="Number of instances to run the operator.", + default="1") + parser.add_argument("-d", "--debug", help="Print debug information.", + action='store_true') + parser.add_argument("-c", "--chain", + help="Chain ops together (create data dependencies)", + action='store_true') + args = parser.parse_args() + main(args) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc new file mode 100644 index 0000000..52b5117 --- /dev/null +++ b/binaries/benchmark_helper.cc @@ -0,0 +1,300 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "binaries/benchmark_helper.h" +#include "caffe2/core/blob_serialization.h" +#ifdef __CUDA_ARCH__ +#include "caffe2/core/context_gpu.h" +#endif +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/bench_utils.h" +#include "caffe2/utils/string_utils.h" +#include "observers/net_observer_reporter_print.h" +#include "observers/observer_config.h" +#include "observers/perf_observer.h" + +using std::map; +using std::shared_ptr; +using std::string; +using std::unique_ptr; +using std::vector; + +void observerConfig() { + caffe2::ClearGlobalNetObservers(); + caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) { + return caffe2::make_unique(subject); + }); + caffe2::ObserverConfig::setReporter( + caffe2::make_unique()); +} + +bool backendCudaSet(const string& backend) { + bool run_on_gpu = false; + if (backend == "cuda") { +#ifdef __CUDA_ARCH__ + if (caffe2::HasCudaGPU()) { + run_on_gpu = true; + } else { + CAFFE_THROW("NO GPU support on this host machine"); + } +#else + CAFFE_THROW("NO GPU support"); +#endif + } + return run_on_gpu; +} + +void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) { + for (int j = 0; j < net_def->op_size(); j++) { + caffe2::OperatorDef* op = net_def->mutable_op(j); + op->mutable_device_option()->set_device_type(run_dev); + } +} + +void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { + if (backend != "builtin") { + string engine = backend == "nnpack" + ? "NNPACK" + : backend == "eigen" ? "EIGEN" + : backend == "mkl" ? "MKLDNN" + : backend == "cuda" + ? "CUDA" + : backend == "dnnlowp" ? "DNNLOWP" + : backend == "dnnlowp_16" + ? "DNNLOWP_16" + : backend == "default" ? "" : "NONE"; + CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); + for (int i = 0; i < net_def->op_size(); i++) { + caffe2::OperatorDef* op_def = net_def->mutable_op(i); + op_def->set_engine(engine); + } + } +} + +void loadInput( + shared_ptr workspace, + const bool run_on_gpu, + map& tensor_protos_map, + const string& input, + const string& input_file, + const string& input_dims, + const string& input_type) { + // Load input. 
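+  // Inputs named in --input can be fed in one of two ways:
+  //  1. --input_file: a comma-separated list of files containing serialized
+  //     TensorProtos, one file per input name.
+  //  2. --input_dims / --input_type: semicolon-separated dims and type
+  //     strings, one per input name; blobs of that shape are created and
+  //     resized here, on GPU or CPU depending on the chosen backend.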
+ if (input.size()) { + vector input_names = caffe2::split(',', input); + if (input_file.size()) { + vector input_files = caffe2::split(',', input_file); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_files.size(), + "Input name and file should have the same number."); + for (int i = 0; i < input_names.size(); ++i) { + caffe2::TensorProtos tensor_protos; + CAFFE_ENFORCE( + caffe2::ReadProtoFromFile(input_files[i], &tensor_protos)); + workspace->CreateBlob(input_names[i]); + tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos)); + } + } else if (input_dims.size() || input_type.size()) { + CAFFE_ENFORCE_GE( + input_dims.size(), + 0, + "Input dims must be specified when input tensors are used."); + CAFFE_ENFORCE_GE( + input_type.size(), + 0, + "Input type must be specified when input tensors are used."); + + vector input_dims_list = caffe2::split(';', input_dims); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_dims_list.size(), + "Input name and dims should have the same number of items."); + vector input_type_list = caffe2::split(';', input_type); + CAFFE_ENFORCE_EQ( + input_names.size(), + input_type_list.size(), + "Input name and type should have the same number of items."); + for (size_t i = 0; i < input_names.size(); ++i) { + vector input_dims_str = caffe2::split(',', input_dims_list[i]); + vector input_dims; + for (const string& s : input_dims_str) { + input_dims.push_back(caffe2::stoi(s)); + } + caffe2::Blob* blob = workspace->GetBlob(input_names[i]); + if (blob == nullptr) { + blob = workspace->CreateBlob(input_names[i]); + } + if (run_on_gpu) { + LOG(INFO) << "Running on GPU."; +#ifdef __CUDA_ARCH__ + caffe2::TensorCUDA* tensor = blob->GetMutable(); + CHECK_NOTNULL(tensor); + tensor->Resize(input_dims); + if (input_type_list[i] == "uint8_t") { + tensor->mutable_data(); + } else if (input_type_list[i] == "float") { + tensor->mutable_data(); + } else { + CAFFE_THROW("Unsupported input type: ", input_type_list[i]); + } +#else + CAFFE_THROW("Not support GPU on mobile."); +#endif + } else { + caffe2::TensorCPU* tensor = blob->GetMutable(); + CHECK_NOTNULL(tensor); + tensor->Resize(input_dims); + if (input_type_list[i] == "uint8_t") { + tensor->mutable_data(); + } else if (input_type_list[i] == "float") { + tensor->mutable_data(); + } else { + CAFFE_THROW("Unsupported input type: ", input_type_list[i]); + } + } + } + } else { + CAFFE_THROW( + "You requested input tensors, but neither input_file nor " + "input_dims is set."); + } + } +} + +void fillInputBlob( + shared_ptr workspace, + map& tensor_protos_map, + int iteration) { + if (tensor_protos_map.empty()) { + return; + } + + for (auto& tensor_kv : tensor_protos_map) { + caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first); + if (blob == nullptr) { + blob = workspace->CreateBlob(tensor_kv.first); + } + // todo: support gpu and make this function a tempalte + int protos_size = tensor_kv.second.protos_size(); + caffe2::TensorProto* tensor_proto = + tensor_kv.second.mutable_protos(iteration % protos_size); + caffe2::TensorCPU* tensor = blob->GetMutable(); + tensor->Resize(std::vector()); + if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { + (tensor->mutable_data())[0] = tensor_proto->string_data(0); + } else if (tensor_proto->data_type() == caffe2::TensorProto::FLOAT) { + (tensor->mutable_data())[0] = tensor_proto->float_data(0); + } + // todo: for other types + } +} + +void runNetwork( + shared_ptr workspace, + caffe2::NetDef& net_def, + map& tensor_protos_map, + const bool wipe_cache, + const bool 
run_individual, + const int warmup, + const int iter) { + if (!net_def.has_name()) { + net_def.set_name("benchmark"); + } + + caffe2::NetBase* net = workspace->CreateNet(net_def); + CHECK_NOTNULL(net); + + LOG(INFO) << "Starting benchmark."; + caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); + LOG(INFO) << "Running warmup runs."; + for (int i = 0; i < warmup; ++i) { + fillInputBlob(workspace, tensor_protos_map, i); + CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); + } + + if (wipe_cache) { + caffe2::wipe_cache(); + } + LOG(INFO) << "Main runs."; + CAFFE_ENFORCE( + iter >= 0, + "Number of main runs should be non negative, provided ", + iter, + "."); + for (int i = 0; i < iter; ++i) { + caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); + fillInputBlob(workspace, tensor_protos_map, i); + CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); + if (wipe_cache) { + caffe2::wipe_cache(); + } + if (run_individual) { + caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); + CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); + if (wipe_cache) { + caffe2::wipe_cache(); + } + } + } +} + +void writeOutput( + shared_ptr workspace, + const bool run_on_gpu, + const string& output, + const string& output_folder, + const bool text_output) { + string output_prefix = output_folder.size() ? output_folder + "/" : ""; + if (output.size()) { + vector output_names = caffe2::split(',', output); + if (output == "*") { + output_names = workspace->Blobs(); + } + for (const string& name : output_names) { + CAFFE_ENFORCE( + workspace->HasBlob(name), + "You requested a non-existing blob: ", + name); + if (text_output) { + if (run_on_gpu) { +#ifdef __CUDA_ARCH__ + writeTextOutput( + workspace->GetBlob(name)->GetMutable(), + output_prefix, + name); +#else + CAFFE_THROW("Not support GPU."); +#endif + } else { + writeTextOutput( + workspace->GetBlob(name)->GetMutable(), + output_prefix, + name); + } + } else { + string serialized = workspace->GetBlob(name)->Serialize(name); + string output_filename = output_prefix + name; + caffe2::WriteStringToFile(serialized, output_filename.c_str()); + } + } + } +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h new file mode 100644 index 0000000..0a52e16 --- /dev/null +++ b/binaries/benchmark_helper.h @@ -0,0 +1,99 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/string_utils.h" + +using std::map; +using std::shared_ptr; +using std::string; +using std::vector; + +template +void writeTextOutput( + TensorType* tensor, + const string& output_prefix, + const string& name) { + string output_name = output_prefix + "/" + name + ".txt"; + caffe2::TensorSerializer ser; + caffe2::BlobProto blob_proto; + ser.Serialize( + *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size()); + blob_proto.set_name(output_name); + blob_proto.set_type("Tensor"); + CAFFE_ENFORCE(blob_proto.has_tensor()); + caffe2::TensorProto tensor_proto = blob_proto.tensor(); + vector data; + switch (tensor_proto.data_type()) { + case caffe2::TensorProto::FLOAT: { + std::copy( + tensor_proto.float_data().begin(), + tensor_proto.float_data().end(), + std::back_inserter(data)); + break; + } + case caffe2::TensorProto::INT32: { + std::copy( + tensor_proto.int32_data().begin(), + tensor_proto.int32_data().end(), + std::back_inserter(data)); + break; + } + default: + CAFFE_THROW("Unimplemented Blob type."); + } + std::ofstream output_file(output_name); + std::ostream_iterator output_iterator(output_file, "\n"); + std::copy(data.begin(), data.end(), output_iterator); +} + +void observerConfig(); +bool backendCudaSet(const string&); +void setDeviceType(caffe2::NetDef*, caffe2::DeviceType&); +void setOperatorEngine(caffe2::NetDef*, const string&); +void loadInput( + shared_ptr, + const bool, + map&, + const string&, + const string&, + const string&, + const string&); +void fillInputBlob( + shared_ptr, + map&, + int iteration); +void writeOutput( + shared_ptr, + const bool, + const string&, + const string&, + const bool); +void runNetwork( + shared_ptr, + caffe2::NetDef&, + map&, + const bool, + const bool, + const int, + const int); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc new file mode 100644 index 0000000..729479a --- /dev/null +++ b/binaries/caffe2_benchmark.cc @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "binaries/benchmark_helper.h" + +using std::make_shared; +using std::map; +using std::string; +using std::vector; + +CAFFE2_DEFINE_string( + backend, + "builtin", + "The backend to use when running the model. The allowed " + "backend choices are: builtin, default, nnpack, eigen, mkl, cuda"); + +CAFFE2_DEFINE_string( + init_net, + "", + "The given net to initialize any parameters."); +CAFFE2_DEFINE_string( + input, + "", + "Input that is needed for running the network. If " + "multiple input needed, use comma separated string."); +CAFFE2_DEFINE_string( + input_dims, + "", + "Alternate to input_files, if all inputs are simple " + "float TensorCPUs, specify the dimension using comma " + "separated numbers. If multiple input needed, use " + "semicolon to separate the dimension of different " + "tensors."); +CAFFE2_DEFINE_string( + input_file, + "", + "Input file that contain the serialized protobuf for " + "the input blobs. If multiple input needed, use comma " + "separated string. Must have the same number of items " + "as input does."); +CAFFE2_DEFINE_string( + input_type, + "float", + "Input type when specifying the input dimension." 
+ "The supported types are float, uint8_t."); +CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); +CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); +CAFFE2_DEFINE_string( + output, + "", + "Output that should be dumped after the execution " + "finishes. If multiple outputs are needed, use comma " + "separated string. If you want to dump everything, pass " + "'*' as the output value."); +CAFFE2_DEFINE_string( + output_folder, + "", + "The folder that the output should be written to. This " + "folder must already exist in the file system."); +CAFFE2_DEFINE_bool( + run_individual, + false, + "Whether to benchmark individual operators."); +CAFFE2_DEFINE_bool( + text_output, + false, + "Whether to write out output in text format for regression purpose."); +CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_bool( + wipe_cache, + false, + "Whether to evict the cache before running network."); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); + + // support other device type in the future? + caffe2::DeviceType run_dev = run_on_gpu ? caffe2::CUDA : caffe2::CPU; + + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); + setDeviceType(&init_net_def, run_dev); + setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. + caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); + setDeviceType(&net_def, run_dev); + setOperatorEngine(&net_def, caffe2::FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + caffe2::FLAGS_input, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_dims, + caffe2::FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + caffe2::FLAGS_wipe_cache, + caffe2::FLAGS_run_individual, + caffe2::FLAGS_warmup, + caffe2::FLAGS_iter); + + writeOutput( + workspace, + run_on_gpu, + caffe2::FLAGS_output, + caffe2::FLAGS_output_folder, + caffe2::FLAGS_text_output); + + return 0; +} diff --git a/binaries/convert_caffe_image_db.cc b/binaries/convert_caffe_image_db.cc new file mode 100644 index 0000000..ef5a570 --- /dev/null +++ b/binaries/convert_caffe_image_db.cc @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe/proto/caffe.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_string(output_db, "", "The output db."); +CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::Transaction; +using caffe2::TensorProto; +using caffe2::TensorProtos; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr out_db(caffe2::db::CreateDB( + caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); + std::unique_ptr cursor(in_db->NewCursor()); + std::unique_ptr transaction(out_db->NewTransaction()); + int count = 0; + for (; cursor->Valid(); cursor->Next()) { + caffe::Datum datum; + CAFFE_ENFORCE(datum.ParseFromString(cursor->value())); + TensorProtos protos; + TensorProto* data = protos.add_protos(); + TensorProto* label = protos.add_protos(); + label->set_data_type(TensorProto::INT32); + label->add_dims(1); + label->add_int32_data(datum.label()); + if (datum.encoded()) { + // This is an encoded image. we will copy over the data directly. + data->set_data_type(TensorProto::STRING); + data->add_dims(1); + data->add_string_data(datum.data()); + } else { + // float data not supported right now. + CAFFE_ENFORCE_EQ(datum.float_data_size(), 0); + std::vector buffer_vec(datum.data().size()); + char* buffer = buffer_vec.data(); + // swap order from CHW to HWC + int channels = datum.channels(); + int size = datum.height() * datum.width(); + CAFFE_ENFORCE_EQ(datum.data().size(), channels * size); + for (int c = 0; c < channels; ++c) { + char* dst = buffer + c; + const char* src = datum.data().c_str() + c * size; + for (int n = 0; n < size; ++n) { + dst[n*channels] = src[n]; + } + } + data->set_data_type(TensorProto::BYTE); + data->add_dims(datum.height()); + data->add_dims(datum.width()); + data->add_dims(datum.channels()); + data->set_byte_data(buffer, datum.data().size()); + } + transaction->Put(cursor->key(), protos.SerializeAsString()); + if (++count % caffe2::FLAGS_batch_size == 0) { + transaction->Commit(); + LOG(INFO) << "Converted " << count << " items so far."; + } + } + LOG(INFO) << "A total of " << count << " items processed."; + return 0; +} + diff --git a/binaries/convert_db.cc b/binaries/convert_db.cc new file mode 100644 index 0000000..cb0710a --- /dev/null +++ b/binaries/convert_db.cc @@ -0,0 +1,51 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_string(output_db, "", "The output db."); +CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::Transaction; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr out_db(caffe2::db::CreateDB( + caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); + std::unique_ptr cursor(in_db->NewCursor()); + std::unique_ptr transaction(out_db->NewTransaction()); + int count = 0; + for (; cursor->Valid(); cursor->Next()) { + transaction->Put(cursor->key(), cursor->value()); + if (++count % caffe2::FLAGS_batch_size == 0) { + transaction->Commit(); + LOG(INFO) << "Converted " << count << " items so far."; + } + } + LOG(INFO) << "A total of " << count << " items processed."; + return 0; +} diff --git a/binaries/convert_encoded_to_raw_leveldb.cc b/binaries/convert_encoded_to_raw_leveldb.cc new file mode 100644 index 0000000..4e272fc --- /dev/null +++ b/binaries/convert_encoded_to_raw_leveldb.cc @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts an image dataset to leveldb. +// +// caffe2::FLAGS_input_folder is the root folder that holds all the images, and +// caffe2::FLAGS_list_file should be a list of files as well as their labels, in the +// format as +// subfolder1/file1.JPEG 7 +// .... 
+ +#include + +#include // NOLINT(readability/streams) +#include +#include +#include + +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" +#include "leveldb/db.h" +#include "leveldb/write_batch.h" + +CAFFE2_DEFINE_string(input_db_name, "", "The input image file name."); +CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); +CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); +CAFFE2_DEFINE_int(scale, 256, + "If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given " + "value."); +CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); + + +namespace caffe2 { + +using std::string; +using std::unique_ptr; + +void ConvertToRawDataset( + const string& input_db_name, const string& output_db_name) { + // input leveldb + std::unique_ptr input_db; + LOG(INFO) << "Opening input leveldb " << input_db_name; + { + leveldb::Options options; + options.create_if_missing = false; + leveldb::DB* db_temp; + leveldb::Status status = leveldb::DB::Open( + options, input_db_name, &db_temp); + CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, "."); + input_db.reset(db_temp); + } + + // output leveldb + std::unique_ptr output_db; + std::unique_ptr batch; + LOG(INFO) << "Opening leveldb " << output_db_name; + { + leveldb::Options options; + options.error_if_exists = true; + options.create_if_missing = true; + options.write_buffer_size = 268435456; + leveldb::DB* db_temp; + leveldb::Status status = leveldb::DB::Open( + options, output_db_name, &db_temp); + CAFFE_ENFORCE( + status.ok(), + "Failed to open leveldb ", + output_db_name, + ". Is it already existing?"); + output_db.reset(db_temp); + } + batch.reset(new leveldb::WriteBatch()); + + TensorProtos input_protos; + TensorProtos output_protos; + TensorProto* data = output_protos.add_protos(); + TensorProto* label = output_protos.add_protos(); + data->set_data_type(TensorProto::BYTE); + data->add_dims(0); + data->add_dims(0); + if (caffe2::FLAGS_color) { + data->add_dims(3); + } + string value; + + unique_ptr iter; + iter.reset(input_db->NewIterator(leveldb::ReadOptions())); + iter->SeekToFirst(); + int count = 0; + for (; iter->Valid(); iter->Next()) { + CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString())); + label->CopyFrom(input_protos.protos(1)); + const string& encoded_image = input_protos.protos(0).string_data(0); + int encoded_size = encoded_image.size(); + cv::Mat img = cv::imdecode( + cv::Mat(1, &encoded_size, CV_8UC1, + const_cast(encoded_image.data())), + caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); + cv::Mat resized_img; + int scaled_width, scaled_height; + if (caffe2::FLAGS_warp) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = caffe2::FLAGS_scale; + } else if (img.rows > img.cols) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; + } else { + scaled_height = caffe2::FLAGS_scale; + scaled_width = static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; + } + cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0, + cv::INTER_LINEAR); + data->set_dims(0, scaled_height); + data->set_dims(1, scaled_width); + DCHECK(resized_img.isContinuous()); + data->set_byte_data(resized_img.ptr(), + scaled_height * scaled_width * (caffe2::FLAGS_color ? 
3 : 1)); + output_protos.SerializeToString(&value); + // Put in db + batch->Put(iter->key(), value); + if (++count % 1000 == 0) { + output_db->Write(leveldb::WriteOptions(), batch.get()); + batch.reset(new leveldb::WriteBatch()); + LOG(INFO) << "Processed " << count << " files."; + } + } + // write the last batch + if (count % 1000 != 0) { + output_db->Write(leveldb::WriteOptions(), batch.get()); + } + LOG(INFO) << "Processed a total of " << count << " files."; +} + +} // namespace caffe2 + + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertToRawDataset( + caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name); + return 0; +} diff --git a/binaries/core_overhead_benchmark.cc b/binaries/core_overhead_benchmark.cc new file mode 100644 index 0000000..74f19d5 --- /dev/null +++ b/binaries/core_overhead_benchmark.cc @@ -0,0 +1,223 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark.h" + +#include "caffe2/core/context.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/operator.h" + +#define CAFFE2_SKIP_IF_NO_GPU \ + if (!caffe2::NumCudaDevices()) { \ + state.SkipWithError("No CUDA available, skipping benchmark."); \ + return; \ + } + +using namespace caffe2; + +static void BM_CUDAContextCreation(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + volatile CUDAContext context_so_we_do_initialization_work; + while (state.KeepRunning()) { + volatile CUDAContext context; + } +} +BENCHMARK(BM_CUDAContextCreation); + +static void BM_CUDAContextStreamAccess(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + CUDAContext context; + while (state.KeepRunning()) { + volatile cudaStream_t stream = context.cuda_stream(); + } +} +BENCHMARK(BM_CUDAContextStreamAccess); + +static void BM_cudaGetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int id; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaGetDevice(&id)); + } +} +BENCHMARK(BM_cudaGetDevice); + +static void BM_cudaSetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int total = NumCudaDevices(); + int i = 0; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice((i++) % total)); + } +} +BENCHMARK(BM_cudaSetDevice); + +static void BM_cudaSetAndGetDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + int total = NumCudaDevices(); + int i = 0; + int id; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice((i++) % total)); + CUDA_ENFORCE(cudaGetDevice(&id)); + } +} +BENCHMARK(BM_cudaSetAndGetDevice); + +static void BM_cudaSetSameDevice(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaSetDevice(0)); + } +} +BENCHMARK(BM_cudaSetSameDevice); + +static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + cudaStream_t stream; + while (state.KeepRunning()) { + CUDA_ENFORCE(cudaStreamCreate(&stream)); + CUDA_ENFORCE(cudaStreamSynchronize(stream)); + 
CUDA_ENFORCE(cudaStreamDestroy(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamCreateSyncDelete);
+
+static void BM_cudaStreamSynchronize(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamSynchronize);
+
+static void BM_cudaEventRecord(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  cudaEvent_t event;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  CUDA_ENFORCE(cudaEventCreateWithFlags(
+      &event, cudaEventDefault | cudaEventDisableTiming));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaEventRecord(event, stream));
+  }
+}
+BENCHMARK(BM_cudaEventRecord);
+
+static void BM_cudaStreamWaitEventThenStreamSynchronize(
+    benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  cudaStream_t stream;
+  cudaEvent_t event;
+  CUDA_ENFORCE(cudaStreamCreate(&stream));
+  CUDA_ENFORCE(cudaEventCreateWithFlags(
+      &event, cudaEventDefault | cudaEventDisableTiming));
+  CUDA_ENFORCE(cudaEventRecord(event, stream));
+  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
+  CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  while (state.KeepRunning()) {
+    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
+    CUDA_ENFORCE(cudaStreamSynchronize(stream));
+  }
+}
+BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
+
+static void BM_CudaPointerAffinity(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
+  float* ptr = tensor.mutable_data<float>();
+  while (state.KeepRunning()) {
+    volatile int id = GetGPUIDForPointer(ptr);
+  }
+}
+BENCHMARK(BM_CudaPointerAffinity);
+
+namespace {
+template <class Context>
+class DummyEmptyOp : public Operator<Context> {
+ public:
+  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws) {}
+
+  bool RunOnDevice() final { return true; }
+};
+
+REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
+REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
+OPERATOR_SCHEMA(DummyEmpty);
+} // namespace
+
+static void BM_OperatorCreationCPU(benchmark::State& state) {
+  std::unique_ptr<OperatorBase> op;
+  OperatorDef def;
+  Workspace ws;
+  def.set_type("DummyEmpty");
+  def.mutable_device_option()->set_device_type(CPU);
+  while (state.KeepRunning()) {
+    op = CreateOperator(def, &ws);
+  }
+}
+BENCHMARK(BM_OperatorCreationCPU);
+
+static void BM_OperatorCreationCUDA(benchmark::State& state) {
+  CAFFE2_SKIP_IF_NO_GPU;
+  std::unique_ptr<OperatorBase> op;
+  OperatorDef def;
+  Workspace ws;
+  def.set_type("DummyEmpty");
+  def.mutable_device_option()->set_device_type(CUDA);
+  while (state.KeepRunning()) {
+    op = CreateOperator(def, &ws);
+  }
+}
+BENCHMARK(BM_OperatorCreationCUDA);
+
+static void BM_RawAllocDeallocCPU(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    // Allocating only 1 byte in order to measure the overhead.
+    auto ptr_and_deleter = GetCPUAllocator()->New(1);
+    // Deallocate.
+ ptr_and_deleter.second(ptr_and_deleter.first); + } +} +BENCHMARK(BM_RawAllocDeallocCPU); + +static void BM_TensorAllocDeallocCPU(benchmark::State& state) { + Tensor tensor; + // small allocation + tensor.Resize(32, 32); + while (state.KeepRunning()) { + CHECK(tensor.mutable_data()); + tensor.FreeMemory(); + } +} +BENCHMARK(BM_TensorAllocDeallocCPU); + +static void BM_TensorAllocDeallocCUDA(benchmark::State& state) { + CAFFE2_SKIP_IF_NO_GPU; + Tensor tensor; + // small allocation + tensor.Resize(32, 32); + while (state.KeepRunning()) { + CHECK(tensor.mutable_data()); + tensor.FreeMemory(); + } +} +BENCHMARK(BM_TensorAllocDeallocCUDA); + +BENCHMARK_MAIN(); diff --git a/binaries/db_throughput.cc b/binaries/db_throughput.cc new file mode 100644 index 0000000..5d8fe5c --- /dev/null +++ b/binaries/db_throughput.cc @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/core/timer.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); +CAFFE2_DEFINE_int(report_interval, 1000, "The report interval."); +CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test."); +CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface."); +CAFFE2_DEFINE_int(num_read_threads, 1, + "The number of concurrent reading threads."); + +using caffe2::db::Cursor; +using caffe2::db::DB; +using caffe2::db::DBReader; +using caffe2::string; + +void TestThroughputWithDB() { + std::unique_ptr in_db(caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); + std::unique_ptr cursor(in_db->NewCursor()); + for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { + caffe2::Timer timer; + for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { + string key = cursor->key(); + string value = cursor->value(); + //VLOG(1) << "Key " << key; + cursor->Next(); + if (!cursor->Valid()) { + cursor->SeekToFirst(); + } + } + double elapsed_seconds = timer.Seconds(); + printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n", + iter_id, elapsed_seconds, + caffe2::FLAGS_report_interval / elapsed_seconds); + } +} + +void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) { + string key, value; + for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { + caffe2::Timer timer; + for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { + reader->Read(&key, &value); + } + double elapsed_seconds = timer.Seconds(); + printf("Thread %03d iteration %03d, took %4.5f seconds, " + "throughput %f items/sec.\n", + thread_id, iter_id, elapsed_seconds, + caffe2::FLAGS_report_interval / elapsed_seconds); + } +} + +void TestThroughputWithReader() { + caffe2::db::DBReader reader( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db); + std::vector> 
reading_threads( + caffe2::FLAGS_num_read_threads); + for (int i = 0; i < reading_threads.size(); ++i) { + reading_threads[i].reset(new std::thread( + TestThroughputWithReaderWorker, &reader, i)); + } + for (int i = 0; i < reading_threads.size(); ++i) { + reading_threads[i]->join(); + } +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + if (caffe2::FLAGS_use_reader) { + TestThroughputWithReader(); + } else { + TestThroughputWithDB(); + } + return 0; +} diff --git a/binaries/inspect_gpus.cc b/binaries/inspect_gpus.cc new file mode 100644 index 0000000..6b80a4e --- /dev/null +++ b/binaries/inspect_gpus.cc @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include "caffe2/core/common_gpu.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" + +using std::vector; + +CAFFE2_DECLARE_int(caffe2_log_level); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::SetUsageMessage( + "Inspects the GPUs on the current machine and prints out their details " + "provided by cuda."); + + int gpu_count; + CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count)); + for (int i = 0; i < gpu_count; ++i) { + LOG(INFO) << "Querying device ID = " << i; + caffe2::DeviceQuery(i); + } + + vector > access_pattern; + CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern)); + + std::stringstream sstream; + // Find topology + for (int i = 0; i < gpu_count; ++i) { + for (int j = 0; j < gpu_count; ++j) { + sstream << (access_pattern[i][j] ? "+" : "-") << " "; + } + sstream << std::endl; + } + LOG(INFO) << "Access pattern: " << std::endl << sstream.str(); + + return 0; +} diff --git a/binaries/make_cifar_db.cc b/binaries/make_cifar_db.cc new file mode 100644 index 0000000..9f9c0bc --- /dev/null +++ b/binaries/make_cifar_db.cc @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// This script converts the CIFAR dataset to the leveldb format used +// by caffe to perform classification. 
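+// Each record in the CIFAR binary files is a label byte (preceded by an
+// extra coarse-label byte for CIFAR-100) followed by 32x32x3 bytes of
+// channel-first pixel data; ReadImage below converts it to HWC order.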
+// Usage: +// convert_cifar_data input_folder output_db_file +// The CIFAR dataset could be downloaded at +// http://www.cs.toronto.edu/~kriz/cifar.html + +#include +#include // NOLINT(readability/streams) +#include +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_folder, "", "The input folder name."); +CAFFE2_DEFINE_string(output_train_db_name, + "", "The output training db name."); +CAFFE2_DEFINE_string(output_test_db_name, + "", "The output testing db name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_bool(is_cifar100, false, + "If set, convert cifar100. Otherwise do cifar10."); + +namespace caffe2 { + +using std::stringstream; + +const int kCIFARSize = 32; +const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3; +const int kCIFAR10BatchSize = 10000; +const int kCIFAR10TestDataSize = 10000; +const int kCIFAR10TrainBatches = 5; + +const int kCIFAR100TrainDataSize = 50000; +const int kCIFAR100TestDataSize = 10000; + +void ReadImage(std::ifstream* file, int* label, char* buffer) { + char label_char; + if (caffe2::FLAGS_is_cifar100) { + // Skip the coarse label. + file->read(&label_char, 1); + } + file->read(&label_char, 1); + *label = label_char; + // Yes, there are better ways to do it, like in-place swap... but I am too + // lazy so let's just write it in a memory-wasteful way. + std::array channel_first_storage; + file->read(channel_first_storage.data(), kCIFARImageNBytes); + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) { + buffer[i * 3 + c] = + channel_first_storage[c * kCIFARSize * kCIFARSize + i]; + } + } + return; +} + +void WriteToDB(const string& filename, const int num_items, + const int& offset, db::DB* db) { + TensorProtos protos; + TensorProto* data = protos.add_protos(); + TensorProto* label = protos.add_protos(); + data->set_data_type(TensorProto::BYTE); + data->add_dims(kCIFARSize); + data->add_dims(kCIFARSize); + data->add_dims(3); + label->set_data_type(TensorProto::INT32); + label->add_dims(1); + label->add_int32_data(0); + + LOG(INFO) << "Converting file " << filename; + std::ifstream data_file(filename.c_str(), + std::ios::in | std::ios::binary); + CAFFE_ENFORCE(data_file, "Unable to open file ", filename); + char str_buffer[kCIFARImageNBytes]; + int label_value; + string serialized_protos; + std::unique_ptr transaction(db->NewTransaction()); + for (int itemid = 0; itemid < num_items; ++itemid) { + ReadImage(&data_file, &label_value, str_buffer); + data->set_byte_data(str_buffer, kCIFARImageNBytes); + label->set_int32_data(0, label_value); + protos.SerializeToString(&serialized_protos); + snprintf(str_buffer, kCIFARImageNBytes, "%05d", + offset + itemid); + transaction->Put(string(str_buffer), serialized_protos); + } +} + +void ConvertCIFAR() { + std::unique_ptr train_db( + db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name, + db::NEW)); + std::unique_ptr test_db( + db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name, + db::NEW)); + + if (!caffe2::FLAGS_is_cifar100) { + // This is cifar 10. 
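+    // CIFAR-10 ships as five training batches (data_batch_1.bin ..
+    // data_batch_5.bin) of 10000 images each plus one test batch; the
+    // training batches are written to train_db at consecutive offsets and
+    // the test batch to test_db.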
+ for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) { + stringstream train_file; + train_file << caffe2::FLAGS_input_folder << "/data_batch_" << fileid + 1 + << ".bin"; + WriteToDB(train_file.str(), kCIFAR10BatchSize, + fileid * kCIFAR10BatchSize, train_db.get()); + } + stringstream test_file; + test_file << caffe2::FLAGS_input_folder << "/test_batch.bin"; + WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get()); + } else { + // This is cifar 100. + stringstream train_file; + train_file << caffe2::FLAGS_input_folder << "/train.bin"; + WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get()); + stringstream test_file; + test_file << caffe2::FLAGS_input_folder << "/test.bin"; + WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get()); + } +} + +} // namespace caffe2 + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertCIFAR(); + return 0; +} diff --git a/binaries/make_image_db.cc b/binaries/make_image_db.cc new file mode 100644 index 0000000..2bdbb53 --- /dev/null +++ b/binaries/make_image_db.cc @@ -0,0 +1,280 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts an image dataset to a database. +// +// caffe2::FLAGS_input_folder is the root folder that holds all the images +// +// caffe2::FLAGS_list_file is the path to a file containing a list of files +// and their labels, as follows: +// +// subfolder1/file1.JPEG 7 +// subfolder1/file2.JPEG 7 +// subfolder2/file1.JPEG 8 +// ... 
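+//
+// Example invocation (illustrative paths; the flags are defined below):
+//   make_image_db --db=leveldb --input_folder=/data/images/ \
+//     --list_file=/data/images/train_list.txt \
+//     --output_db_name=/data/images-train-leveldb \
+//     --num_threads=8 --shuffle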
+// + +#include + +#include +#include +#include +#include +#include +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_bool(shuffle, false, + "Randomly shuffle the order of images and their labels"); +CAFFE2_DEFINE_string(input_folder, "", "The input image file name."); +CAFFE2_DEFINE_string( + list_file, + "", + "The text file containing the list of images."); +CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_bool(raw, false, + "If set, we pre-read the images and store the raw buffer."); +CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); +CAFFE2_DEFINE_int( + scale, + 256, + "If caffe2::FLAGS_raw is set, scale the shorter edge to the given value."); +CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); +CAFFE2_DEFINE_int( + num_threads, + -1, + "Number of image parsing and conversion threads."); + +namespace caffe2 { + +class Converter { + public: + explicit Converter() { + data_ = protos_.add_protos(); + label_ = protos_.add_protos(); + if (caffe2::FLAGS_raw) { + data_->set_data_type(TensorProto::BYTE); + data_->add_dims(0); + data_->add_dims(0); + if (caffe2::FLAGS_color) { + data_->add_dims(3); + } + } else { + data_->set_data_type(TensorProto::STRING); + data_->add_dims(1); + data_->add_string_data(""); + } + label_->set_data_type(TensorProto::INT32); + label_->add_dims(1); + label_->add_int32_data(0); + } + + ~Converter() { + if (thread_.joinable()) { + thread_.join(); + } + } + + void queue(const std::pair& pair) { + in_.push(pair); + } + + void start() { + thread_ = std::thread(&Converter::run, this); + } + + std::string get() { + std::unique_lock lock(mutex_); + while (out_.empty()) { + cv_.wait(lock); + } + + auto value = out_.front(); + out_.pop(); + cv_.notify_one(); + return value; + } + + void run() { + const auto& input_folder = caffe2::FLAGS_input_folder; + std::unique_lock lock(mutex_); + std::string value; + while (!in_.empty()) { + auto pair = in_.front(); + in_.pop(); + lock.unlock(); + + label_->set_int32_data(0, pair.second); + + // Add raw file contents to DB if !raw + if (!caffe2::FLAGS_raw) { + std::ifstream image_file_stream(input_folder + pair.first); + if (!image_file_stream) { + LOG(ERROR) << "Cannot open " << input_folder << pair.first + << ". Skipping."; + } else { + data_->mutable_string_data(0)->assign( + std::istreambuf_iterator(image_file_stream), + std::istreambuf_iterator()); + } + } else { + // Load image + cv::Mat img = cv::imread( + input_folder + pair.first, + caffe2::FLAGS_color ? 
CV_LOAD_IMAGE_COLOR + : CV_LOAD_IMAGE_GRAYSCALE); + + // Resize image + cv::Mat resized_img; + int scaled_width, scaled_height; + if (caffe2::FLAGS_warp) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = caffe2::FLAGS_scale; + } else if (img.rows > img.cols) { + scaled_width = caffe2::FLAGS_scale; + scaled_height = + static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; + } else { + scaled_height = caffe2::FLAGS_scale; + scaled_width = + static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; + } + cv::resize( + img, + resized_img, + cv::Size(scaled_width, scaled_height), + 0, + 0, + cv::INTER_LINEAR); + data_->set_dims(0, scaled_height); + data_->set_dims(1, scaled_width); + + // Assert we don't have to deal with alignment + DCHECK(resized_img.isContinuous()); + auto nbytes = resized_img.total() * resized_img.elemSize(); + data_->set_byte_data(resized_img.ptr(), nbytes); + } + + protos_.SerializeToString(&value); + + // Add serialized proto to out queue or wait if it is not empty + lock.lock(); + while (!out_.empty()) { + cv_.wait(lock); + } + out_.push(value); + cv_.notify_one(); + } + } + + protected: + TensorProtos protos_; + TensorProto* data_; + TensorProto* label_; + std::queue> in_; + std::queue out_; + + std::mutex mutex_; + std::condition_variable cv_; + std::thread thread_; +}; + +void ConvertImageDataset( + const string& input_folder, + const string& list_filename, + const string& output_db_name, + const bool /*shuffle*/) { + std::ifstream list_file(list_filename); + std::vector > lines; + std::string filename; + int file_label; + while (list_file >> filename >> file_label) { + lines.push_back(std::make_pair(filename, file_label)); + } + + if (caffe2::FLAGS_shuffle) { + LOG(INFO) << "Shuffling data"; + std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701)); + } + + auto num_threads = caffe2::FLAGS_num_threads; + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + } + + LOG(INFO) << "Processing " << lines.size() << " images..."; + LOG(INFO) << "Opening DB " << output_db_name; + + auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW); + auto transaction = db->NewTransaction(); + + LOG(INFO) << "Using " << num_threads << " processing threads..."; + std::vector converters(num_threads); + + // Queue entries across converters + for (auto i = 0; i < lines.size(); i++) { + converters[i % converters.size()].queue(lines[i]); + } + + // Start all converters + for (auto& converter : converters) { + converter.start(); + } + + constexpr auto key_max_length = 256; + char key_cstr[key_max_length]; + string value; + int count = 0; + for (auto i = 0; i < lines.size(); i++) { + // Get serialized proto for this entry + auto value = converters[i % converters.size()].get(); + + // Synthesize key for this entry + auto key_len = snprintf( + key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str()); + DCHECK_LE(key_len, sizeof(key_cstr)); + + // Put in db + transaction->Put(string(key_cstr), value); + + if (++count % 1000 == 0) { + // Commit the current writes. 
+ transaction->Commit(); + LOG(INFO) << "Processed " << count << " files."; + } + } + + // Commit final transaction + transaction->Commit(); + LOG(INFO) << "Processed " << count << " files."; +} + +} // namespace caffe2 + + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::ConvertImageDataset( + caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file, + caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle); + return 0; +} diff --git a/binaries/make_mnist_db.cc b/binaries/make_mnist_db.cc new file mode 100644 index 0000000..8737d0e --- /dev/null +++ b/binaries/make_mnist_db.cc @@ -0,0 +1,139 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This script converts the MNIST dataset to leveldb. +// The MNIST dataset could be downloaded at +// http://yann.lecun.com/exdb/mnist/ + +#include // NOLINT(readability/streams) +#include + +#include "caffe2/core/common.h" +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(image_file, "", "The input image file name."); +CAFFE2_DEFINE_string(label_file, "", "The label file name."); +CAFFE2_DEFINE_string(output_file, "", "The output db name."); +CAFFE2_DEFINE_string(db, "leveldb", "The db type."); +CAFFE2_DEFINE_int(data_limit, -1, + "If set, only output this number of data points."); +CAFFE2_DEFINE_bool(channel_first, false, + "If set, write the data as channel-first (CHW order) as the old " + "Caffe does."); + +namespace caffe2 { +uint32_t swap_endian(uint32_t val) { + val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + return (val << 16) | (val >> 16); +} + +void convert_dataset(const char* image_filename, const char* label_filename, + const char* db_path, const int data_limit) { + // Open files + std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); + std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); + CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename); + CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename); + // Read the magic and the meta data + uint32_t magic; + uint32_t num_items; + uint32_t num_labels; + uint32_t rows; + uint32_t cols; + + image_file.read(reinterpret_cast(&magic), 4); + magic = swap_endian(magic); + if (magic == 529205256) { + LOG(FATAL) << + "It seems that you forgot to unzip the mnist dataset. You should " + "first unzip them using e.g. 
gunzip on Linux.";
+  }
+  CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic.");
+  label_file.read(reinterpret_cast<char*>(&magic), 4);
+  magic = swap_endian(magic);
+  CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic.");
+  image_file.read(reinterpret_cast<char*>(&num_items), 4);
+  num_items = swap_endian(num_items);
+  label_file.read(reinterpret_cast<char*>(&num_labels), 4);
+  num_labels = swap_endian(num_labels);
+  CAFFE_ENFORCE_EQ(num_items, num_labels);
+  image_file.read(reinterpret_cast<char*>(&rows), 4);
+  rows = swap_endian(rows);
+  image_file.read(reinterpret_cast<char*>(&cols), 4);
+  cols = swap_endian(cols);
+
+  // leveldb
+  std::unique_ptr<db::DB> mnist_db(
+      db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW));
+  std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
+  // Storing to db
+  char label_value;
+  std::vector<char> pixels(rows * cols);
+  int count = 0;
+  const int kMaxKeyLength = 10;
+  char key_cstr[kMaxKeyLength];
+  string value;
+
+  TensorProtos protos;
+  TensorProto* data = protos.add_protos();
+  TensorProto* label = protos.add_protos();
+  data->set_data_type(TensorProto::BYTE);
+  if (caffe2::FLAGS_channel_first) {
+    data->add_dims(1);
+    data->add_dims(rows);
+    data->add_dims(cols);
+  } else {
+    data->add_dims(rows);
+    data->add_dims(cols);
+    data->add_dims(1);
+  }
+  label->set_data_type(TensorProto::INT32);
+  label->add_int32_data(0);
+
+  LOG(INFO) << "A total of " << num_items << " items.";
+  LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
+  for (int item_id = 0; item_id < num_items; ++item_id) {
+    image_file.read(pixels.data(), rows * cols);
+    label_file.read(&label_value, 1);
+    for (int i = 0; i < rows * cols; ++i) {
+      data->set_byte_data(pixels.data(), rows * cols);
+    }
+    label->set_int32_data(0, static_cast<int>(label_value));
+    snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
+    protos.SerializeToString(&value);
+    string keystr(key_cstr);
+
+    // Put in db
+    transaction->Put(keystr, value);
+    if (++count % 1000 == 0) {
+      transaction->Commit();
+    }
+    if (data_limit > 0 && count == data_limit) {
+      LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
+      break;
+    }
+  }
+}
+}  // namespace caffe2
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(),
+      caffe2::FLAGS_label_file.c_str(),
+      caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit);
+  return 0;
+}
diff --git a/binaries/predictor_verifier.cc b/binaries/predictor_verifier.cc
new file mode 100644
index 0000000..e82a8e9
--- /dev/null
+++ b/binaries/predictor_verifier.cc
@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "caffe2/core/flags.h" +#include "caffe2/core/init.h" +#include "caffe2/core/predictor.h" +#include "caffe2/utils/proto_utils.h" + +CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer."); +CAFFE2_DEFINE_string( + predict_net, + "", + "The given path to the predict protobuffer."); + +namespace caffe2 { + +void run() { + if (FLAGS_init_net.empty()) { + LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net."; + } + if (FLAGS_predict_net.empty()) { + LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net."; + } + caffe2::NetDef init_net, predict_net; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net)); + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net)); + // Can be large due to constant fills + VLOG(1) << "Init net: " << ProtoDebugString(init_net); + LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net); + auto predictor = caffe2::make_unique(init_net, predict_net); + LOG(INFO) << "Checking that a null forward-pass works"; + Predictor::TensorVector inputVec, outputVec; + predictor->run(inputVec, &outputVec); + CAFFE_ENFORCE_GT(outputVec.size(), 0); +} +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + caffe2::run(); + // This is to allow us to use memory leak checks. + caffe2::ShutdownProtobufLibrary(); + return 0; +} diff --git a/binaries/print_core_object_sizes.cc b/binaries/print_core_object_sizes.cc new file mode 100644 index 0000000..2000c34 --- /dev/null +++ b/binaries/print_core_object_sizes.cc @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/context.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/proto/caffe2.pb.h" + +#define PRINT_SIZE(cls) \ + std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \ + << std::endl; + +int main(int /* unused */, char** /* unused */) { + PRINT_SIZE(caffe2::Blob); + PRINT_SIZE(caffe2::Tensor); + PRINT_SIZE(caffe2::Tensor); + PRINT_SIZE(caffe2::CPUContext); + PRINT_SIZE(caffe2::CUDAContext); + PRINT_SIZE(caffe2::OperatorBase); + PRINT_SIZE(caffe2::OperatorDef); + PRINT_SIZE(caffe2::Operator); + PRINT_SIZE(caffe2::Operator); + PRINT_SIZE(caffe2::TypeMeta); + PRINT_SIZE(caffe2::Workspace); + return 0; +} diff --git a/binaries/print_registered_core_operators.cc b/binaries/print_registered_core_operators.cc new file mode 100644 index 0000000..c76ea3e --- /dev/null +++ b/binaries/print_registered_core_operators.cc @@ -0,0 +1,73 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/operator_schema.h" + +CAFFE2_DEFINE_string(schema, "", + "Print doc and schema of a particular operator"); + +static bool HasSchema(const std::string& str) { + return caffe2::OpSchemaRegistry::Schema(str); +} + +static bool HasDoc(const std::string& str) { + const auto* schema = caffe2::OpSchemaRegistry::Schema(str); + return (schema != nullptr) && (schema->doc() != nullptr); +} + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + if (!caffe2::FLAGS_schema.empty()) { + const auto* schema = caffe2::OpSchemaRegistry::Schema( + caffe2::FLAGS_schema); + if (!schema) { + std::cerr << "Operator " << caffe2::FLAGS_schema + << " doesn't have a schema" << std::endl; + return 1; + } + std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl + << *schema; + return 0; + } + + for (const auto& pair : *caffe2::gDeviceTypeRegistry()) { + std::cout << "Device type " << pair.first +#ifndef CAFFE2_USE_LITE_PROTO + << " (" << caffe2::DeviceType_Name( + static_cast(pair.first)) + << ")" +#endif + << std::endl; + for (const auto& key : pair.second->Keys()) { + std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key) + << ")\t" << key << std::endl; + } + } + + std::cout << "Operators that have gradients registered:" << std::endl; + for (const auto& key : caffe2::GradientRegistry()->Keys()) { + std::cout << "\t(schema: " << HasSchema(key) << ", doc: " + << HasDoc(key) << ")\t" + << key << std::endl; + } + return 0; +} diff --git a/binaries/run_plan.cc b/binaries/run_plan.cc new file mode 100644 index 0000000..5ad2c3a --- /dev/null +++ b/binaries/run_plan.cc @@ -0,0 +1,40 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + if (caffe2::FLAGS_plan.size() == 0) { + LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan."; + return 0; + } + LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; + caffe2::PlanDef plan_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); + std::unique_ptr workspace(new caffe2::Workspace()); + workspace->RunPlan(plan_def); + + // This is to allow us to use memory leak checks. 
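+  // ShutdownProtobufLibrary frees protobuf's globally allocated objects so
+  // that leak checkers (e.g. valgrind) can report a clean exit.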
+ caffe2::ShutdownProtobufLibrary(); + return 0; +} diff --git a/binaries/run_plan_mpi.cc b/binaries/run_plan_mpi.cc new file mode 100644 index 0000000..ee720fa --- /dev/null +++ b/binaries/run_plan_mpi.cc @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/operator.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); + +int main(int argc, char** argv) { + caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it."); + int mpi_ret; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret); + if (mpi_ret != MPI_THREAD_MULTIPLE && + mpi_ret != MPI_THREAD_SERIALIZED) { + std::cerr << "Caffe2 MPI requires the underlying MPI to support the " + "MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n"; + return 1; + } + caffe2::GlobalInit(&argc, &argv); + LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; + caffe2::PlanDef plan_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); + std::unique_ptr workspace(new caffe2::Workspace()); + workspace->RunPlan(plan_def); + + // This is to allow us to use memory leak checks. + caffe2::ShutdownProtobufLibrary(); + MPI_Finalize(); + return 0; +} diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc new file mode 100644 index 0000000..196be4a --- /dev/null +++ b/binaries/speed_benchmark.cc @@ -0,0 +1,211 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#ifdef CAFFE2_OPTIMIZER +#include "caffe2/opt/optimizer.h" +#endif +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/proto_utils.h" +#include "caffe2/utils/string_utils.h" + +CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); +CAFFE2_DEFINE_string( + init_net, + "", + "The given net to initialize any parameters."); +CAFFE2_DEFINE_string( + input, + "", + "Input that is needed for running the network. If " + "multiple input needed, use comma separated string."); +CAFFE2_DEFINE_string( + input_file, + "", + "Input file that contain the serialized protobuf for " + "the input blobs. If multiple input needed, use comma " + "separated string. 
Must have the same number of items " + "as input does."); +CAFFE2_DEFINE_string( + input_dims, + "", + "Alternate to input_files, if all inputs are simple " + "float TensorCPUs, specify the dimension using comma " + "separated numbers. If multiple input needed, use " + "semicolon to separate the dimension of different " + "tensors."); +CAFFE2_DEFINE_string(input_type, "", "Input type (uint8_t/float)"); +CAFFE2_DEFINE_string( + output, + "", + "Output that should be dumped after the execution " + "finishes. If multiple outputs are needed, use comma " + "separated string. If you want to dump everything, pass " + "'*' as the output value."); +CAFFE2_DEFINE_string( + output_folder, + "", + "The folder that the output should be written to. This " + "folder must already exist in the file system."); +CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); +CAFFE2_DEFINE_int(opt, 0, "The level of optimization to run automatically."); +CAFFE2_DEFINE_bool( + run_individual, + false, + "Whether to benchmark individual operators."); + +CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators"); +CAFFE2_DEFINE_string(engine, "", "Forced engine field value"); +CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators"); +CAFFE2_DEFINE_string(algo, "", "Forced algo arg value"); + +using std::string; +using std::unique_ptr; +using std::vector; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + unique_ptr workspace(new caffe2::Workspace()); + + // Run initialization network. + caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def)); + CAFFE_ENFORCE(workspace->RunNetOnce(net_def)); + + // Load input. 
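+  // Inputs may be given either as serialized BlobProto files (--input_file)
+  // or, for plain float/uint8_t tensors, as shapes via --input_dims plus
+  // --input_type; the two branches below cover these two cases.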
+  if (caffe2::FLAGS_input.size()) {
+    vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
+    if (caffe2::FLAGS_input_file.size()) {
+      vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_files.size(),
+          "Input name and file should have the same number.");
+      for (int i = 0; i < input_names.size(); ++i) {
+        caffe2::BlobProto blob_proto;
+        CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
+        workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
+      }
+    } else if (caffe2::FLAGS_input_dims.size() || caffe2::FLAGS_input_type.size()) {
+      CAFFE_ENFORCE_GE(
+          caffe2::FLAGS_input_dims.size(),
+          0,
+          "Input dims must be specified when input tensors are used.");
+      CAFFE_ENFORCE_GE(
+          caffe2::FLAGS_input_type.size(),
+          0,
+          "Input type must be specified when input tensors are used.");
+
+      vector<string> input_dims_list =
+          caffe2::split(';', caffe2::FLAGS_input_dims);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_dims_list.size(),
+          "Input name and dims should have the same number of items.");
+      vector<string> input_type_list =
+          caffe2::split(';', caffe2::FLAGS_input_type);
+      CAFFE_ENFORCE_EQ(
+          input_names.size(),
+          input_type_list.size(),
+          "Input name and type should have the same number of items.");
+      for (size_t i = 0; i < input_names.size(); ++i) {
+        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
+        vector<int> input_dims;
+        for (const string& s : input_dims_str) {
+          input_dims.push_back(caffe2::stoi(s));
+        }
+        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
+        if (blob == nullptr) {
+          blob = workspace->CreateBlob(input_names[i]);
+        }
+        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+        CHECK_NOTNULL(tensor);
+        tensor->Resize(input_dims);
+        if (input_type_list[i] == "uint8_t") {
+          tensor->mutable_data<uint8_t>();
+        } else if (input_type_list[i] == "float") {
+          tensor->mutable_data<float>();
+        } else {
+          CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
+        }
+      }
+    } else {
+      CAFFE_THROW(
+          "You requested input tensors, but neither input_file nor "
+          "input_dims is set.");
+    }
+  }
+
+  // Run main network.
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
+  if (!net_def.has_name()) {
+    net_def.set_name("benchmark");
+  }
+  // force changing engine and algo
+  if (caffe2::FLAGS_force_engine) {
+    LOG(INFO) << "force engine be: " << caffe2::FLAGS_engine;
+    for (const auto& op : net_def.op()) {
+      const_cast<caffe2::OperatorDef*>(&op)->set_engine(caffe2::FLAGS_engine);
+    }
+  }
+  if (caffe2::FLAGS_force_algo) {
+    LOG(INFO) << "force algo be: " << caffe2::FLAGS_algo;
+    for (const auto& op : net_def.op()) {
+      caffe2::GetMutableArgument(
+          "algo", true, const_cast<caffe2::OperatorDef*>(&op))
+          ->set_s(caffe2::FLAGS_algo);
+    }
+  }
+  if (caffe2::FLAGS_opt) {
+#ifdef CAFFE2_OPTIMIZER
+    net_def = caffe2::opt::optimize(net_def, workspace.get(), caffe2::FLAGS_opt);
+#else
+    LOG(WARNING) << "Caffe2 not compiled with optimization passes.";
+#endif
+  }
+
+  caffe2::NetBase* net = workspace->CreateNet(net_def);
+  CHECK_NOTNULL(net);
+  CAFFE_ENFORCE(net->Run());
+  net->TEST_Benchmark(
+      caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);
+
+  string output_prefix = caffe2::FLAGS_output_folder.size() ?
caffe2::FLAGS_output_folder + "/" + : ""; + if (caffe2::FLAGS_output.size()) { + vector output_names = caffe2::split(',', caffe2::FLAGS_output); + if (caffe2::FLAGS_output == "*") { + output_names = workspace->Blobs(); + } + for (const string& name : output_names) { + CAFFE_ENFORCE( + workspace->HasBlob(name), + "You requested a non-existing blob: ", + name); + string serialized = workspace->GetBlob(name)->Serialize(name); + string output_filename = output_prefix + name; + caffe2::WriteStringToFile(serialized, output_filename.c_str()); + } + } + + return 0; +} diff --git a/binaries/split_db.cc b/binaries/split_db.cc new file mode 100644 index 0000000..077afda --- /dev/null +++ b/binaries/split_db.cc @@ -0,0 +1,77 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/proto/caffe2.pb.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_int(splits, 0, "The number of splits."); +CAFFE2_DEFINE_string(db_type, "", "The db type."); +CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); + +namespace caffe2 { + +static int Split(int argc, char** argv) { + GlobalInit(&argc, &argv); + + CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db."); + CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number."); + CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type]."); + + unique_ptr in_db( + db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ)); + CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db); + unique_ptr cursor(in_db->NewCursor()); + // This usually won't happen, but FWIW. 
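+  // ("This" being a db that opens successfully yet returns a null cursor.)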
+  CAFFE_ENFORCE(
+      cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
+
+  vector<unique_ptr<db::DB>> out_dbs;
+  vector<unique_ptr<db::Transaction>> transactions;
+  for (int i = 0; i < FLAGS_splits; ++i) {
+    out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
+        FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
+    CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
+    transactions.push_back(
+        unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
+    CAFFE_ENFORCE(
+        transactions.back().get(), "Cannot get transaction for output db #", i);
+  }
+
+  int count = 0;
+  for (; cursor->Valid(); cursor->Next()) {
+    transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
+    if (++count % FLAGS_batch_size == 0) {
+      for (int i = 0; i < FLAGS_splits; ++i) {
+        transactions[i]->Commit();
+      }
+      LOG(INFO) << "Split " << count << " items so far.";
+    }
+  }
+  LOG(INFO) << "A total of " << count << " items processed.";
+  return 0;
+}
+
+}  // namespace caffe2
+
+int main(int argc, char** argv) {
+  return caffe2::Split(argc, argv);
+}
diff --git a/binaries/tsv_2_proto.cc b/binaries/tsv_2_proto.cc
new file mode 100644
index 0000000..e9dba77
--- /dev/null
+++ b/binaries/tsv_2_proto.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fstream>
+#include <string>
+
+#include "caffe2/core/blob_serialization.h"
+#include "caffe2/core/db.h"
+#include "caffe2/core/init.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/proto/caffe2.pb.h"
+#include "caffe2/utils/proto_utils.h"
+
+CAFFE2_DEFINE_string(f_in, "", "The input data file name.");
+CAFFE2_DEFINE_string(f_out, "", "The output data file name.");
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  std::ifstream f_in(caffe2::FLAGS_f_in);
+  std::ofstream f_out(caffe2::FLAGS_f_out);
+  std::string line;
+  caffe2::TensorProtos tensor_protos;
+  while (std::getline(f_in, line)) {
+    caffe2::TensorProto* data = tensor_protos.add_protos();
+    data->set_data_type(caffe2::TensorProto::STRING);
+    data->add_dims(0);
+    data->add_string_data(line);
+    data->set_name("text");
+  }
+  f_in.close();
+  std::string output_str;
+  tensor_protos.SerializeToString(&output_str);
+  f_out << output_str;
+  f_out.close();
+  return 0;
+}
diff --git a/binaries/tutorial_blob.cc b/binaries/tutorial_blob.cc
new file mode 100644
index 0000000..f379eac
--- /dev/null
+++ b/binaries/tutorial_blob.cc
@@ -0,0 +1,89 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "caffe2/core/blob.h"
+#include "caffe2/core/init.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/logging.h"
+
+// We will be lazy and just use the whole namespace.
+using namespace caffe2;
+
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  caffe2::ShowLogInfoToStderr();
+
+  LOG(INFO) <<
+      "This script corresponds to the Blob part of the Caffe2 C++ "
+      "tutorial.";
+
+  LOG(INFO) << "Let's create a blob myblob.";
+
+  Blob myblob;
+
+  LOG(INFO) << "Let's set it to int and set the value to 10.";
+
+  int* myint = myblob.GetMutable<int>();
+  *myint = 10;
+
+  LOG(INFO)
+      << "Is the blob type int? "
+      << myblob.IsType<int>();
+
+  LOG(INFO)
+      << "Is the blob type float? "
+      << myblob.IsType<float>();
+
+  const int& myint_const = myblob.Get<int>();
+  LOG(INFO)
+      << "The value of the int number stored in the blob is: "
+      << myint_const;
+
+  LOG(INFO)
+      << "Let's try to get a float pointer. This will trigger an exception.";
+
+  try {
+    const float& myfloat = myblob.Get<float>();
+    LOG(FATAL) << "This line should never happen.";
+  } catch (std::exception& e) {
+    LOG(INFO)
+        << "As expected, we got an exception. Its content says: "
+        << e.what();
+  }
+
+  LOG(INFO) <<
+      "However, we can change the content type (and destroy the old "
+      "content) by calling GetMutable. Let's change it to double.";
+
+  double* mydouble = myblob.GetMutable<double>();
+  *mydouble = 3.14;
+
+  LOG(INFO) << "The new content is: " << myblob.Get<double>();
+
+  LOG(INFO) <<
+      "If we have a pre-created object, we can use Reset() to transfer the "
+      "object to a blob.";
+
+  std::string* pvec = new std::string();
+  myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.
+
+  LOG(INFO) << "Is the blob now of type string? "
+            << myblob.IsType<std::string>();
+
+  LOG(INFO) << "This concludes the blob tutorial.";
+  return 0;
+}
diff --git a/binaries/zmq_feeder.cc b/binaries/zmq_feeder.cc
new file mode 100644
index 0000000..27e8684
--- /dev/null
+++ b/binaries/zmq_feeder.cc
@@ -0,0 +1,66 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This binary provides an easy way to open a zeromq server and feed data to
+// clients connected to it. It uses the Caffe2 db as the backend, thus allowing
+// one to convert any db-compliant storage to a zeromq service.
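+//
+// Example invocation (hypothetical db path; any db type registered with
+// Caffe2, such as leveldb or lmdb, should work):
+//   zmq_feeder --input_db=/path/to/some_leveldb --input_db_type=leveldb \
+//       --server=tcp://*:5555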
+ +#include "caffe2/core/db.h" +#include "caffe2/core/init.h" +#include "caffe2/core/logging.h" +#include "caffe2/utils/zmq_helper.h" + +CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address."); +CAFFE2_DEFINE_string(input_db, "", "The input db."); +CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); + +using caffe2::db::DB; +using caffe2::db::Cursor; +using caffe2::string; + +int main(int argc, char** argv) { + caffe2::GlobalInit(&argc, &argv); + + LOG(INFO) << "Opening DB..."; + auto in_db = caffe2::db::CreateDB( + caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ); + CAFFE_ENFORCE( + in_db, + "Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " + + caffe2::FLAGS_input_db_type); + auto cursor = in_db->NewCursor(); + LOG(INFO) << "DB opened."; + + LOG(INFO) << "Starting ZeroMQ server..."; + + // Socket to talk to clients + caffe2::ZmqSocket sender(ZMQ_PUSH); + sender.Bind(caffe2::FLAGS_server); + LOG(INFO) << "Server created at " << caffe2::FLAGS_server; + + while (1) { + VLOG(1) << "Sending " << cursor->key(); + sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE); + sender.SendTillSuccess(cursor->value(), 0); + cursor->Next(); + if (!cursor->Valid()) { + cursor->SeekToFirst(); + } + } + // We do not do an elegant quit since this binary is going to be terminated by + // control+C. + return 0; +} diff --git a/caffe/__init__.py b/caffe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe/proto/CMakeLists.txt b/caffe/proto/CMakeLists.txt new file mode 100644 index 0000000..558c224 --- /dev/null +++ b/caffe/proto/CMakeLists.txt @@ -0,0 +1,17 @@ +file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto") + +caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES}) + +add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS}) + +if (MSVC) + if(BUILD_SHARED_LIBS) + set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)") + else() + set(Caffe2_API_DEFINE "-DCAFFE2_API=") + endif() + target_compile_definitions( + Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE}) +endif() + +install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto) diff --git a/caffe/proto/__init__.py b/caffe/proto/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe/proto/caffe.proto b/caffe/proto/caffe.proto new file mode 100644 index 0000000..1556781 --- /dev/null +++ b/caffe/proto/caffe.proto @@ -0,0 +1,1399 @@ +syntax = "proto2"; + +package caffe; + +// Specifies the shape (dimensions) of a Blob. +message BlobShape { + repeated int64 dim = 1 [packed = true]; +} + +message BlobProto { + optional BlobShape shape = 7; + repeated float data = 5 [packed = true]; + repeated float diff = 6 [packed = true]; + repeated double double_data = 8 [packed = true]; + repeated double double_diff = 9 [packed = true]; + + // 4D dimensions -- deprecated. Use "shape" instead. + optional int32 num = 1 [default = 0]; + optional int32 channels = 2 [default = 0]; + optional int32 height = 3 [default = 0]; + optional int32 width = 4 [default = 0]; +} + +// The BlobProtoVector is simply a way to pass multiple blobproto instances +// around. +message BlobProtoVector { + repeated BlobProto blobs = 1; +} + +message Datum { + optional int32 channels = 1; + optional int32 height = 2; + optional int32 width = 3; + // the actual image data, in bytes + optional bytes data = 4; + optional int32 label = 5; + // Optionally, the datum could also hold float data. 
+ repeated float float_data = 6; + // If true data contains an encoded image that need to be decoded + optional bool encoded = 7 [default = false]; +} + +message FillerParameter { + // The filler type. + optional string type = 1 [default = 'constant']; + optional float value = 2 [default = 0]; // the value in constant filler + optional float min = 3 [default = 0]; // the min value in uniform filler + optional float max = 4 [default = 1]; // the max value in uniform filler + optional float mean = 5 [default = 0]; // the mean value in Gaussian filler + optional float std = 6 [default = 1]; // the std value in Gaussian filler + // The expected number of non-zero output weights for a given input in + // Gaussian filler -- the default -1 means don't perform sparsification. + optional int32 sparse = 7 [default = -1]; + // Normalize the filler variance by fan_in, fan_out, or their average. + // Applies to 'xavier' and 'msra' fillers. + enum VarianceNorm { + FAN_IN = 0; + FAN_OUT = 1; + AVERAGE = 2; + } + optional VarianceNorm variance_norm = 8 [default = FAN_IN]; +} + +message NetParameter { + optional string name = 1; // consider giving the network a name + // DEPRECATED. See InputParameter. The input blobs to the network. + repeated string input = 3; + // DEPRECATED. See InputParameter. The shape of the input blobs. + repeated BlobShape input_shape = 8; + + // 4D input dimensions -- deprecated. Use "input_shape" instead. + // If specified, for each input blob there should be four + // values specifying the num, channels, height and width of the input blob. + // Thus, there should be a total of (4 * #input) numbers. + repeated int32 input_dim = 4; + + // Whether the network will force every layer to carry out backward operation. + // If set False, then whether to carry out backward is determined + // automatically according to the net structure and learning rates. + optional bool force_backward = 5 [default = false]; + // The current "state" of the network, including the phase, level, and stage. + // Some layers may be included/excluded depending on this state and the states + // specified in the layers' include and exclude fields. + optional NetState state = 6; + + // Print debugging information about results while running Net::Forward, + // Net::Backward, and Net::Update. + optional bool debug_info = 7 [default = false]; + + // The layers that make up the net. Each of their configurations, including + // connectivity and behavior, is specified as a LayerParameter. + repeated LayerParameter layer = 100; // ID 100 so layers are printed last. + + // DEPRECATED: use 'layer' instead. + repeated V1LayerParameter layers = 2; +} + +// NOTE +// Update the next available ID when you add a new SolverParameter field. +// +// SolverParameter next available ID: 41 (last added: type) +message SolverParameter { + ////////////////////////////////////////////////////////////////////////////// + // Specifying the train and test networks + // + // Exactly one train net must be specified using one of the following fields: + // train_net_param, train_net, net_param, net + // One or more test nets may be specified using any of the following fields: + // test_net_param, test_net, net_param, net + // If more than one test net field is specified (e.g., both net and + // test_net are specified), they will be evaluated in the field order given + // above: (1) test_net_param, (2) test_net, (3) net_param/net. + // A test_iter must be specified for each test_net. 
+ // A test_level and/or a test_stage may also be specified for each test_net. + ////////////////////////////////////////////////////////////////////////////// + + // Proto filename for the train net, possibly combined with one or more + // test nets. + optional string net = 24; + // Inline train net param, possibly combined with one or more test nets. + optional NetParameter net_param = 25; + + optional string train_net = 1; // Proto filename for the train net. + repeated string test_net = 2; // Proto filenames for the test nets. + optional NetParameter train_net_param = 21; // Inline train net params. + repeated NetParameter test_net_param = 22; // Inline test net params. + + // The states for the train/test nets. Must be unspecified or + // specified once per net. + // + // By default, all states will have solver = true; + // train_state will have phase = TRAIN, + // and all test_state's will have phase = TEST. + // Other defaults are set according to the NetState defaults. + optional NetState train_state = 26; + repeated NetState test_state = 27; + + // The number of iterations for each test net. + repeated int32 test_iter = 3; + + // The number of iterations between two testing phases. + optional int32 test_interval = 4 [default = 0]; + optional bool test_compute_loss = 19 [default = false]; + // If true, run an initial test pass before the first iteration, + // ensuring memory availability and printing the starting value of the loss. + optional bool test_initialization = 32 [default = true]; + optional float base_lr = 5; // The base learning rate + // the number of iterations between displaying info. If display = 0, no info + // will be displayed. + optional int32 display = 6; + // Display the loss averaged over the last average_loss iterations + optional int32 average_loss = 33 [default = 1]; + optional int32 max_iter = 7; // the maximum number of iterations + // accumulate gradients over `iter_size` x `batch_size` instances + optional int32 iter_size = 36 [default = 1]; + + // The learning rate decay policy. The currently implemented learning rate + // policies are as follows: + // - fixed: always return base_lr. + // - step: return base_lr * gamma ^ (floor(iter / step)) + // - exp: return base_lr * gamma ^ iter + // - inv: return base_lr * (1 + gamma * iter) ^ (- power) + // - multistep: similar to step but it allows non uniform steps defined by + // stepvalue + // - poly: the effective learning rate follows a polynomial decay, to be + // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) + // - sigmoid: the effective learning rate follows a sigmod decay + // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // + // where base_lr, max_iter, gamma, step, stepvalue and power are defined + // in the solver parameter protocol buffer, and iter is the current iteration. + optional string lr_policy = 8; + optional float gamma = 9; // The parameter to compute the learning rate. + optional float power = 10; // The parameter to compute the learning rate. + optional float momentum = 11; // The momentum value. + optional float weight_decay = 12; // The weight decay. 
+ // regularization types supported: L1 and L2 + // controlled by weight_decay + optional string regularization_type = 29 [default = "L2"]; + // the stepsize for learning rate policy "step" + optional int32 stepsize = 13; + // the stepsize for learning rate policy "multistep" + repeated int32 stepvalue = 34; + + // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, + // whenever their actual L2 norm is larger. + optional float clip_gradients = 35 [default = -1]; + + optional int32 snapshot = 14 [default = 0]; // The snapshot interval + optional string snapshot_prefix = 15; // The prefix for the snapshot. + // whether to snapshot diff in the results or not. Snapshotting diff will help + // debugging but the final protocol buffer size will be much larger. + optional bool snapshot_diff = 16 [default = false]; + enum SnapshotFormat { + HDF5 = 0; + BINARYPROTO = 1; + } + optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO]; + // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default. + enum SolverMode { + CPU = 0; + GPU = 1; + } + optional SolverMode solver_mode = 17 [default = GPU]; + // the device_id will that be used in GPU mode. Use device_id = 0 in default. + optional int32 device_id = 18 [default = 0]; + // If non-negative, the seed with which the Solver will initialize the Caffe + // random number generator -- useful for reproducible results. Otherwise, + // (and by default) initialize using a seed derived from the system clock. + optional int64 random_seed = 20 [default = -1]; + + // type of the solver + optional string type = 40 [default = "SGD"]; + + // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam + optional float delta = 31 [default = 1e-8]; + // parameters for the Adam solver + optional float momentum2 = 39 [default = 0.999]; + + // RMSProp decay value + // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) + optional float rms_decay = 38; + + // If true, print information about the state of the net that may help with + // debugging learning problems. + optional bool debug_info = 23 [default = false]; + + // If false, don't save a snapshot after training finishes. + optional bool snapshot_after_train = 28 [default = true]; + + // DEPRECATED: old solver enum types, use string instead + enum SolverType { + SGD = 0; + NESTEROV = 1; + ADAGRAD = 2; + RMSPROP = 3; + ADADELTA = 4; + ADAM = 5; + } + // DEPRECATED: use type instead of solver_type + optional SolverType solver_type = 30 [default = SGD]; +} + +// A message that stores the solver snapshots +message SolverState { + optional int32 iter = 1; // The current iteration + optional string learned_net = 2; // The file that stores the learned net. + repeated BlobProto history = 3; // The history for sgd solvers + optional int32 current_step = 4 [default = 0]; // The current step for learning rate +} + +enum Phase { + TRAIN = 0; + TEST = 1; +} + +message NetState { + optional Phase phase = 1 [default = TEST]; + optional int32 level = 2 [default = 0]; + repeated string stage = 3; +} + +message NetStateRule { + // Set phase to require the NetState have a particular phase (TRAIN or TEST) + // to meet this rule. + optional Phase phase = 1; + + // Set the minimum and/or maximum levels in which the layer should be used. + // Leave undefined to meet the rule regardless of level. + optional int32 min_level = 2; + optional int32 max_level = 3; + + // Customizable sets of stages to include or exclude. 
+ // The net must have ALL of the specified stages and NONE of the specified + // "not_stage"s to meet the rule. + // (Use multiple NetStateRules to specify conjunctions of stages.) + repeated string stage = 4; + repeated string not_stage = 5; +} + +// Specifies training parameters (multipliers on global learning constants, +// and the name and other settings used for weight sharing). +message ParamSpec { + // The names of the parameter blobs -- useful for sharing parameters among + // layers, but never required otherwise. To share a parameter between two + // layers, give it a (non-empty) name. + optional string name = 1; + + // Whether to require shared weights to have the same shape, or just the same + // count -- defaults to STRICT if unspecified. + optional DimCheckMode share_mode = 2; + enum DimCheckMode { + // STRICT (default) requires that num, channels, height, width each match. + STRICT = 0; + // PERMISSIVE requires only the count (num*channels*height*width) to match. + PERMISSIVE = 1; + } + + // The multiplier on the global learning rate for this parameter. + optional float lr_mult = 3 [default = 1.0]; + + // The multiplier on the global weight decay for this parameter. + optional float decay_mult = 4 [default = 1.0]; +} + +// NOTE +// Update the next available ID when you add a new LayerParameter field. +// +// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) +message LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the layer type + repeated string bottom = 3; // the name of each bottom blob + repeated string top = 4; // the name of each top blob + + // The train / test phase for computation. + optional Phase phase = 10; + + // The amount of weight to assign each top blob in the objective. + // Each layer assigns a default value, usually of either 0 or 1, + // to each top blob. + repeated float loss_weight = 5; + + // Specifies training parameters (multipliers on global learning constants, + // and the name and other settings used for weight sharing). + repeated ParamSpec param = 6; + + // The blobs containing the numeric parameters of the layer. + repeated BlobProto blobs = 7; + + // Specifies whether to backpropagate to each bottom. If unspecified, + // Caffe will automatically infer whether each input needs backpropagation + // to compute parameter gradients. If set to true for some inputs, + // backpropagation to those inputs is forced; if set false for some inputs, + // backpropagation to those inputs is skipped. + // + // The size must be either 0 or equal to the number of bottoms. + repeated bool propagate_down = 11; + + // Rules controlling whether and when a layer is included in the network, + // based on the current NetState. You may specify a non-zero number of rules + // to include OR exclude, but not both. If no include or exclude rules are + // specified, the layer is always included. If the current NetState meets + // ANY (i.e., one or more) of the specified rules, the layer is + // included/excluded. + repeated NetStateRule include = 8; + repeated NetStateRule exclude = 9; + + // Parameters for data pre-processing. + optional TransformationParameter transform_param = 100; + + // Parameters shared by loss layers. + optional LossParameter loss_param = 101; + + // Layer type-specific parameters. + // + // Note: certain layers may have more than one computational engine + // for their implementation. 
These layers include an Engine type and + // engine parameter for selecting the implementation. + // The default for the engine is set by the ENGINE switch at compile-time. + optional AccuracyParameter accuracy_param = 102; + optional ArgMaxParameter argmax_param = 103; + optional BatchNormParameter batch_norm_param = 139; + optional BiasParameter bias_param = 141; + optional ConcatParameter concat_param = 104; + optional ContrastiveLossParameter contrastive_loss_param = 105; + optional ConvolutionParameter convolution_param = 106; + optional CropParameter crop_param = 144; + optional DataParameter data_param = 107; + optional DropoutParameter dropout_param = 108; + optional DummyDataParameter dummy_data_param = 109; + optional EltwiseParameter eltwise_param = 110; + optional ELUParameter elu_param = 140; + optional EmbedParameter embed_param = 137; + optional ExpParameter exp_param = 111; + optional FlattenParameter flatten_param = 135; + optional HDF5DataParameter hdf5_data_param = 112; + optional HDF5OutputParameter hdf5_output_param = 113; + optional HingeLossParameter hinge_loss_param = 114; + optional ImageDataParameter image_data_param = 115; + optional InfogainLossParameter infogain_loss_param = 116; + optional InnerProductParameter inner_product_param = 117; + optional InputParameter input_param = 143; + optional LogParameter log_param = 134; + optional LRNParameter lrn_param = 118; + optional MemoryDataParameter memory_data_param = 119; + optional MVNParameter mvn_param = 120; + optional ParameterParameter parameter_param = 145; + optional PoolingParameter pooling_param = 121; + optional PowerParameter power_param = 122; + optional PReLUParameter prelu_param = 131; + optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 146; + optional ReductionParameter reduction_param = 136; + optional ReLUParameter relu_param = 123; + optional ReshapeParameter reshape_param = 133; + optional ScaleParameter scale_param = 142; + optional SigmoidParameter sigmoid_param = 124; + optional SoftmaxParameter softmax_param = 125; + optional SPPParameter spp_param = 132; + optional SliceParameter slice_param = 126; + optional TanHParameter tanh_param = 127; + optional ThresholdParameter threshold_param = 128; + optional TileParameter tile_param = 138; + optional WindowDataParameter window_data_param = 129; +} + +// Message that stores parameters used to apply transformation +// to the data layer's data +message TransformationParameter { + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 1 [default = 1]; + // Specify if we want to randomly mirror data. + optional bool mirror = 2 [default = false]; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 3 [default = 0]; + // mean_file and mean_value cannot be specified at the same time + optional string mean_file = 4; + // if specified can be repeated once (would substract it from all the channels) + // or can be repeated the same number of times as channels + // (would subtract them from the corresponding channel) + repeated float mean_value = 5; + // Force the decoded image to have 3 color channels. + optional bool force_color = 6 [default = false]; + // Force the decoded image to have 1 color channels. 
+ optional bool force_gray = 7 [default = false]; +} + +// Message that stores parameters shared by loss layers +message LossParameter { + // If specified, ignore instances with the given label. + optional int32 ignore_label = 1; + // How to normalize the loss for loss layers that aggregate across batches, + // spatial dimensions, or other dimensions. Currently only implemented in + // SoftmaxWithLoss layer. + enum NormalizationMode { + // Divide by the number of examples in the batch times spatial dimensions. + // Outputs that receive the ignore label will NOT be ignored in computing + // the normalization factor. + FULL = 0; + // Divide by the total number of output locations that do not take the + // ignore_label. If ignore_label is not set, this behaves like FULL. + VALID = 1; + // Divide by the batch size. + BATCH_SIZE = 2; + // Do not normalize the loss. + NONE = 3; + } + optional NormalizationMode normalization = 3 [default = VALID]; + // Deprecated. Ignored if normalization is specified. If normalization + // is not specified, then setting this to false will be equivalent to + // normalization = BATCH_SIZE to be consistent with previous behavior. + optional bool normalize = 2; +} + +// Messages that store parameters used by individual layer types follow, in +// alphabetical order. + +message AccuracyParameter { + // When computing accuracy, count as correct by comparing the true label to + // the top k scoring classes. By default, only compare to the top scoring + // class (i.e. argmax). + optional uint32 top_k = 1 [default = 1]; + + // The "label" axis of the prediction blob, whose argmax corresponds to the + // predicted label -- may be negative to index from the end (e.g., -1 for the + // last axis). For example, if axis == 1 and the predictions are + // (N x C x H x W), the label blob is expected to contain N*H*W ground truth + // labels with integer values in {0, 1, ..., C-1}. + optional int32 axis = 2 [default = 1]; + + // If specified, ignore instances with the given label. + optional int32 ignore_label = 3; +} + +message ArgMaxParameter { + // If true produce pairs (argmax, maxval) + optional bool out_max_val = 1 [default = false]; + optional uint32 top_k = 2 [default = 1]; + // The axis along which to maximise -- may be negative to index from the + // end (e.g., -1 for the last axis). + // By default ArgMaxLayer maximizes over the flattened trailing dimensions + // for each index of the first / num dimension. + optional int32 axis = 3; +} + +message ConcatParameter { + // The axis along which to concatenate -- may be negative to index from the + // end (e.g., -1 for the last axis). Other axes must have the + // same dimension for all the bottom blobs. + // By default, ConcatLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 2 [default = 1]; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 concat_dim = 1 [default = 1]; +} + +message BatchNormParameter { + // If false, accumulate global mean/variance values via a moving average. If + // true, use those accumulated values instead of computing mean/variance + // across the batch. + optional bool use_global_stats = 1; + // How much does the moving average decay each iteration? + optional float moving_average_fraction = 2 [default = .999]; + // Small value to add to the variance estimate so that we don't divide by + // zero. 
+ optional float eps = 3 [default = 1e-5]; +} + +message BiasParameter { + // The first axis of bottom[0] (the first input Blob) along which to apply + // bottom[1] (the second input Blob). May be negative to index from the end + // (e.g., -1 for the last axis). + // + // For example, if bottom[0] is 4D with shape 100x3x40x60, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 + // (axis == 1 == -3) 3; 3x40; 3x40x60 + // (axis == 2 == -2) 40; 40x60 + // (axis == 3 == -1) 60 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a scalar bias. + optional int32 axis = 1 [default = 1]; + + // (num_axes is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer. Otherwise, num_axes is determined by the + // number of axes by the second bottom.) + // The number of axes of the input (bottom[0]) covered by the bias + // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. + // Set num_axes := 0, to add a zero-axis Blob: a scalar. + optional int32 num_axes = 2 [default = 1]; + + // (filler is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer.) + // The initialization for the learned bias parameter. + // Default is the zero (0) initialization, resulting in the BiasLayer + // initially performing the identity operation. + optional FillerParameter filler = 3; +} + +message ContrastiveLossParameter { + // margin for dissimilar pair + optional float margin = 1 [default = 1.0]; + // The first implementation of this cost did not exactly match the cost of + // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. + // legacy_version = false (the default) uses (margin - d)^2 as proposed in the + // Hadsell paper. New models should probably use this version. + // legacy_version = true uses (margin - d^2). This is kept to support / + // reproduce existing models and results + optional bool legacy_version = 2 [default = false]; +} + +message ConvolutionParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in all spatial dimensions, or once per spatial dimension. + repeated uint32 pad = 3; // The padding size; defaults to 0 + repeated uint32 kernel_size = 4; // The kernel size + repeated uint32 stride = 6; // The stride; defaults to 1 + // Factor used to dilate the kernel, (implicitly) zero-filling the resulting + // holes. (Kernel dilation is sometimes referred to by its use in the + // algorithme à trous from Holschneider et al. 1987.) + repeated uint32 dilation = 18; // The dilation; defaults to 1 + + // For 2D convolution only, the *_h and *_w versions may also be used to + // specify both spatial dimensions. 
+ optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) + optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) + optional uint32 kernel_h = 11; // The kernel height (2D only) + optional uint32 kernel_w = 12; // The kernel width (2D only) + optional uint32 stride_h = 13; // The stride height (2D only) + optional uint32 stride_w = 14; // The stride width (2D only) + + optional uint32 group = 5 [default = 1]; // The group size for group conv + + optional FillerParameter weight_filler = 7; // The filler for the weight + optional FillerParameter bias_filler = 8; // The filler for the bias + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 15 [default = DEFAULT]; + + // The axis to interpret as "channels" when performing convolution. + // Preceding dimensions are treated as independent inputs; + // succeeding dimensions are treated as "spatial". + // With (N, C, H, W) inputs, and axis == 1 (the default), we perform + // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for + // groups g>1) filters across the spatial axes (H, W) of the input. + // With (N, C, D, H, W) inputs, and axis == 1, we perform + // N independent 3D convolutions, sliding (C/g)-channels + // filters across the spatial axes (D, H, W) of the input. + optional int32 axis = 16 [default = 1]; + + // Whether to force use of the general ND convolution, even if a specific + // implementation for blobs of the appropriate number of spatial dimensions + // is available. (Currently, there is only a 2D-specific convolution + // implementation; for input blobs with num_axes != 2, this option is + // ignored and the ND implementation will be used.) + optional bool force_nd_im2col = 17 [default = false]; +} + +message CropParameter { + // To crop, elements of the first bottom are selected to fit the dimensions + // of the second, reference bottom. The crop is configured by + // - the crop `axis` to pick the dimensions for cropping + // - the crop `offset` to set the shift for all/each dimension + // to align the cropped bottom with the reference bottom. + // All dimensions up to but excluding `axis` are preserved, while + // the dimensions including and trailing `axis` are cropped. + // If only one `offset` is set, then all dimensions are offset by this amount. + // Otherwise, the number of offsets must equal the number of cropped axes to + // shift the crop in each dimension accordingly. + // Note: standard dimensions are N,C,H,W so the default is a spatial crop, + // and `axis` may be negative to index from the end (e.g., -1 for the last + // axis). + optional int32 axis = 1 [default = 2]; + repeated uint32 offset = 2; +} + +message DataParameter { + enum DB { + LEVELDB = 0; + LMDB = 1; + } + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + // DEPRECATED. Each solver accesses a different subset of the database. + optional uint32 rand_skip = 7 [default = 0]; + optional DB backend = 8 [default = LEVELDB]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. 
Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + // Force the encoded image to have 3 color channels + optional bool force_encoded_color = 9 [default = false]; + // Prefetch queue (Number of batches to prefetch to host memory, increase if + // data access bandwidth varies). + optional uint32 prefetch = 10 [default = 4]; +} + +message DropoutParameter { + optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio +} + +// DummyDataLayer fills any number of arbitrarily shaped blobs with random +// (or constant) data generated by "Fillers" (see "message FillerParameter"). +message DummyDataParameter { + // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N + // shape fields, and 0, 1 or N data_fillers. + // + // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. + // If 1 data_filler is specified, it is applied to all top blobs. If N are + // specified, the ith is applied to the ith top blob. + repeated FillerParameter data_filler = 1; + repeated BlobShape shape = 6; + + // 4D dimensions -- deprecated. Use "shape" instead. + repeated uint32 num = 2; + repeated uint32 channels = 3; + repeated uint32 height = 4; + repeated uint32 width = 5; +} + +message EltwiseParameter { + enum EltwiseOp { + PROD = 0; + SUM = 1; + MAX = 2; + } + optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation + repeated float coeff = 2; // blob-wise coefficient for SUM operation + + // Whether to use an asymptotically slower (for >2 inputs) but stabler method + // of computing the gradient for the PROD operation. (No effect for SUM op.) + optional bool stable_prod_grad = 3 [default = true]; +} + +// Message that stores parameters used by ELULayer +message ELUParameter { + // Described in: + // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate + // Deep Network Learning by Exponential Linear Units (ELUs). arXiv + optional float alpha = 1 [default = 1]; +} + +// Message that stores parameters used by EmbedLayer +message EmbedParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + // The input is given as integers to be interpreted as one-hot + // vector indices with dimension num_input. Hence num_input should be + // 1 greater than the maximum possible input value. + optional uint32 input_dim = 2; + + optional bool bias_term = 3 [default = true]; // Whether to use a bias term + optional FillerParameter weight_filler = 4; // The filler for the weight + optional FillerParameter bias_filler = 5; // The filler for the bias + +} + +// Message that stores parameters used by ExpLayer +message ExpParameter { + // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = exp(shift + scale * x). + optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +/// Message that stores parameters used by FlattenLayer +message FlattenParameter { + // The first axis to flatten: all preceding axes are retained in the output. 
+ // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 1 [default = 1]; + + // The last axis to flatten: all following axes are retained in the output. + // May be negative to index from the end (e.g., the default -1 for the last + // axis). + optional int32 end_axis = 2 [default = -1]; +} + +// Message that stores parameters used by HDF5DataLayer +message HDF5DataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 2; + + // Specify whether to shuffle the data. + // If shuffle == true, the ordering of the HDF5 files is shuffled, + // and the ordering of data within any given HDF5 file is shuffled, + // but data between different files are not interleaved; all of a file's + // data are output (in a random order) before moving onto another file. + optional bool shuffle = 3 [default = false]; +} + +message HDF5OutputParameter { + optional string file_name = 1; +} + +message HingeLossParameter { + enum Norm { + L1 = 1; + L2 = 2; + } + // Specify the Norm to use L1 or L2 + optional Norm norm = 1 [default = L1]; +} + +message ImageDataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4 [default = 1]; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + optional uint32 rand_skip = 7 [default = 0]; + // Whether or not ImageLayer should shuffle the list of files at every epoch. + optional bool shuffle = 8 [default = false]; + // It will also resize images if new_height or new_width are not zero. + optional uint32 new_height = 9 [default = 0]; + optional uint32 new_width = 10 [default = 0]; + // Specify if the images are color or gray + optional bool is_color = 11 [default = true]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + optional string root_folder = 12 [default = ""]; +} + +message InfogainLossParameter { + // Specify the infogain matrix source. + optional string source = 1; +} + +message InnerProductParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + optional FillerParameter weight_filler = 3; // The filler for the weight + optional FillerParameter bias_filler = 4; // The filler for the bias + + // The first axis to be lumped into a single inner product computation; + // all preceding axes are retained in the output. + // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 5 [default = 1]; + // Specify whether to transpose the weight matrix or not. + // If transpose == true, any operations will be performed on the transpose + // of the weight matrix. 
The weight matrix itself is not going to be transposed + // but rather the transfer flag of operations will be toggled accordingly. + optional bool transpose = 6 [default = false]; +} + +message InputParameter { + // This layer produces N >= 1 top blob(s) to be assigned manually. + // Define N shapes to set a shape for each top. + // Define 1 shape to set the same shape for every top. + // Define no shape to defer to reshaping manually. + repeated BlobShape shape = 1; +} + +// Message that stores parameters used by LogLayer +message LogParameter { + // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = ln(shift + scale * x) = log_e(shift + scale * x) + optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +// Message that stores parameters used by LRNLayer +message LRNParameter { + optional uint32 local_size = 1 [default = 5]; + optional float alpha = 2 [default = 1.]; + optional float beta = 3 [default = 0.75]; + enum NormRegion { + ACROSS_CHANNELS = 0; + WITHIN_CHANNEL = 1; + } + optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; + optional float k = 5 [default = 1.]; + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +message MemoryDataParameter { + optional uint32 batch_size = 1; + optional uint32 channels = 2; + optional uint32 height = 3; + optional uint32 width = 4; +} + +message MVNParameter { + // This parameter can be set to false to normalize mean only + optional bool normalize_variance = 1 [default = true]; + + // This parameter can be set to true to perform DNN-like MVN + optional bool across_channels = 2 [default = false]; + + // Epsilon for not dividing by zero while normalizing variance + optional float eps = 3 [default = 1e-9]; +} + +message ParameterParameter { + optional BlobShape shape = 1; +} + +message PoolingParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional PoolMethod pool = 1 [default = MAX]; // The pooling method + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in height and width or as Y, X pairs. + optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X) + optional uint32 pad_h = 9 [default = 0]; // The padding height + optional uint32 pad_w = 10 [default = 0]; // The padding width + optional uint32 kernel_size = 2; // The kernel size (square) + optional uint32 kernel_h = 5; // The kernel height + optional uint32 kernel_w = 6; // The kernel width + optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X) + optional uint32 stride_h = 7; // The stride height + optional uint32 stride_w = 8; // The stride width + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 11 [default = DEFAULT]; + // If global_pooling then it will pool over the size of the bottom by doing + // kernel_h = bottom->height and kernel_w = bottom->width + optional bool global_pooling = 12 [default = false]; +} + +message PowerParameter { + // PowerLayer computes outputs y = (shift + scale * x) ^ power. 
+ optional float power = 1 [default = 1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +message PythonParameter { + optional string module = 1; + optional string layer = 2; + // This value is set to the attribute `param_str` of the `PythonLayer` object + // in Python before calling the `setup()` method. This could be a number, + // string, dictionary in Python dict format, JSON, etc. You may parse this + // string in `setup` method and use it in `forward` and `backward`. + optional string param_str = 3 [default = '']; + // Whether this PythonLayer is shared among worker solvers during data parallelism. + // If true, each worker solver sequentially run forward from this layer. + // This value should be set true if you are using it as a data layer. + optional bool share_in_parallel = 4 [default = false]; +} + +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. + optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; + + // Whether to add as additional inputs (bottoms) the initial hidden state + // blobs, and add as additional outputs (tops) the final timestep hidden state + // blobs. The number of additional bottom/top blobs required depends on the + // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. + optional bool expose_hidden = 5 [default = false]; +} + +// Message that stores parameters used by ReductionLayer +message ReductionParameter { + enum ReductionOp { + SUM = 1; + ASUM = 2; + SUMSQ = 3; + MEAN = 4; + } + + optional ReductionOp operation = 1 [default = SUM]; // reduction operation + + // The first axis to reduce to a scalar -- may be negative to index from the + // end (e.g., -1 for the last axis). + // (Currently, only reduction along ALL "tail" axes is supported; reduction + // of axis M through N, where N < num_axes - 1, is unsupported.) + // Suppose we have an n-axis bottom Blob with shape: + // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)). + // If axis == m, the output Blob will have shape + // (d0, d1, d2, ..., d(m-1)), + // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1)) + // times, each including (dm * d(m+1) * ... * d(n-1)) individual data. + // If axis == 0 (the default), the output Blob always has the empty shape + // (count 1), performing reduction across the entire input -- + // often useful for creating new loss functions. + optional int32 axis = 2 [default = 0]; + + optional float coeff = 3 [default = 1.0]; // coefficient for output +} + +// Message that stores parameters used by ReLULayer +message ReLUParameter { + // Allow non-zero slope for negative inputs to speed up optimization + // Described in: + // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities + // improve neural network acoustic models. In ICML Workshop on Deep Learning + // for Audio, Speech, and Language Processing. + optional float negative_slope = 1 [default = 0]; + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 2 [default = DEFAULT]; +} + +message ReshapeParameter { + // Specify the output dimensions. 
If some of the dimensions are set to 0, + // the corresponding dimension from the bottom layer is used (unchanged). + // Exactly one dimension may be set to -1, in which case its value is + // inferred from the count of the bottom blob and the remaining dimensions. + // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: + // + // layer { + // type: "Reshape" bottom: "input" top: "output" + // reshape_param { ... } + // } + // + // If "input" is 2D with shape 2 x 8, then the following reshape_param + // specifications are all equivalent, producing a 3D blob "output" with shape + // 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } + // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } + // + optional BlobShape shape = 1; + + // axis and num_axes control the portion of the bottom blob's shape that are + // replaced by (included in) the reshape. By default (axis == 0 and + // num_axes == -1), the entire bottom blob shape is included in the reshape, + // and hence the shape field must specify the entire output shape. + // + // axis may be non-zero to retain some portion of the beginning of the input + // shape (and may be negative to index from the end; e.g., -1 to begin the + // reshape after the last axis, including nothing in the reshape, + // -2 to include only the last axis, etc.). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. + // Then the following ReshapeLayer specifications are all equivalent, + // producing a blob "output" with shape 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } + // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } + // + // num_axes specifies the extent of the reshape. + // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on + // input axes in the range [axis, axis+num_axes]. + // num_axes may also be -1, the default, to include all remaining axes + // (starting from axis). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. + // Then the following ReshapeLayer specifications are equivalent, + // producing a blob "output" with shape 1 x 2 x 8. + // + // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } + // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } + // reshape_param { shape { dim: 1 } num_axes: 0 } + // + // On the other hand, these would produce output blob shape 2 x 1 x 8: + // + // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } + // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } + // + optional int32 axis = 2 [default = 0]; + optional int32 num_axes = 3 [default = -1]; +} + +message ScaleParameter { + // The first axis of bottom[0] (the first input Blob) along which to apply + // bottom[1] (the second input Blob). May be negative to index from the end + // (e.g., -1 for the last axis). + // + // For example, if bottom[0] is 4D with shape 100x3x40x60, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 + // (axis == 1 == -3) 3; 3x40; 3x40x60 + // (axis == 2 == -2) 40; 40x60 + // (axis == 3 == -1) 60 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a scalar multiplier. 
+ optional int32 axis = 1 [default = 1]; + + // (num_axes is ignored unless just one bottom is given and the scale is + // a learned parameter of the layer. Otherwise, num_axes is determined by the + // number of axes by the second bottom.) + // The number of axes of the input (bottom[0]) covered by the scale + // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. + // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. + optional int32 num_axes = 2 [default = 1]; + + // (filler is ignored unless just one bottom is given and the scale is + // a learned parameter of the layer.) + // The initialization for the learned scale parameter. + // Default is the unit (1) initialization, resulting in the ScaleLayer + // initially performing the identity operation. + optional FillerParameter filler = 3; + + // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but + // may be more efficient). Initialized with bias_filler (defaults to 0). + optional bool bias_term = 4 [default = false]; + optional FillerParameter bias_filler = 5; +} + +message SigmoidParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; +} + +message SliceParameter { + // The axis along which to slice -- may be negative to index from the end + // (e.g., -1 for the last axis). + // By default, SliceLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 3 [default = 1]; + repeated uint32 slice_point = 2; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 slice_dim = 1 [default = 1]; +} + +// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer +message SoftmaxParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; + + // The axis along which to perform the softmax -- may be negative to index + // from the end (e.g., -1 for the last axis). + // Any other axes will be evaluated as independent softmaxes. + optional int32 axis = 2 [default = 1]; +} + +message TanHParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; +} + +// Message that stores parameters used by TileLayer +message TileParameter { + // The index of the axis to tile. + optional int32 axis = 1 [default = 1]; + + // The number of copies (tiles) of the blob to output. + optional int32 tiles = 2; +} + +// Message that stores parameters used by ThresholdLayer +message ThresholdParameter { + optional float threshold = 1 [default = 0]; // Strictly positive values +} + +message WindowDataParameter { + // Specify the data source. + optional string source = 1; + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // Specify the batch size. + optional uint32 batch_size = 4; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 5 [default = 0]; + // Specify if we want to randomly mirror data. 
+ optional bool mirror = 6 [default = false]; + // Foreground (object) overlap threshold + optional float fg_threshold = 7 [default = 0.5]; + // Background (non-object) overlap threshold + optional float bg_threshold = 8 [default = 0.5]; + // Fraction of batch that should be foreground objects + optional float fg_fraction = 9 [default = 0.25]; + // Amount of contextual padding to add around a window + // (used only by the window_data_layer) + optional uint32 context_pad = 10 [default = 0]; + // Mode for cropping out a detection window + // warp: cropped window is warped to a fixed size and aspect ratio + // square: the tightest square around the window is cropped + optional string crop_mode = 11 [default = "warp"]; + // cache_images: will load all images in memory for faster access + optional bool cache_images = 12 [default = false]; + // append root_folder to locate images + optional string root_folder = 13 [default = ""]; +} + +message SPPParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional uint32 pyramid_height = 1; + optional PoolMethod pool = 2 [default = MAX]; // The pooling method + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +// DEPRECATED: use LayerParameter. +message V1LayerParameter { + repeated string bottom = 2; + repeated string top = 3; + optional string name = 4; + repeated NetStateRule include = 32; + repeated NetStateRule exclude = 33; + enum LayerType { + NONE = 0; + ABSVAL = 35; + ACCURACY = 1; + ARGMAX = 30; + BNLL = 2; + CONCAT = 3; + CONTRASTIVE_LOSS = 37; + CONVOLUTION = 4; + DATA = 5; + DECONVOLUTION = 39; + DROPOUT = 6; + DUMMY_DATA = 32; + EUCLIDEAN_LOSS = 7; + ELTWISE = 25; + EXP = 38; + FLATTEN = 8; + HDF5_DATA = 9; + HDF5_OUTPUT = 10; + HINGE_LOSS = 28; + IM2COL = 11; + IMAGE_DATA = 12; + INFOGAIN_LOSS = 13; + INNER_PRODUCT = 14; + LRN = 15; + MEMORY_DATA = 29; + MULTINOMIAL_LOGISTIC_LOSS = 16; + MVN = 34; + POOLING = 17; + POWER = 26; + RELU = 18; + SIGMOID = 19; + SIGMOID_CROSS_ENTROPY_LOSS = 27; + SILENCE = 36; + SOFTMAX = 20; + SOFTMAX_LOSS = 21; + SPLIT = 22; + SLICE = 33; + TANH = 23; + WINDOW_DATA = 24; + THRESHOLD = 31; + } + optional LayerType type = 5; + repeated BlobProto blobs = 6; + repeated string param = 1001; + repeated DimCheckMode blob_share_mode = 1002; + enum DimCheckMode { + STRICT = 0; + PERMISSIVE = 1; + } + repeated float blobs_lr = 7; + repeated float weight_decay = 8; + repeated float loss_weight = 35; + optional AccuracyParameter accuracy_param = 27; + optional ArgMaxParameter argmax_param = 23; + optional ConcatParameter concat_param = 9; + optional ContrastiveLossParameter contrastive_loss_param = 40; + optional ConvolutionParameter convolution_param = 10; + optional DataParameter data_param = 11; + optional DropoutParameter dropout_param = 12; + optional DummyDataParameter dummy_data_param = 26; + optional EltwiseParameter eltwise_param = 24; + optional ExpParameter exp_param = 41; + optional HDF5DataParameter hdf5_data_param = 13; + optional HDF5OutputParameter hdf5_output_param = 14; + optional HingeLossParameter hinge_loss_param = 29; + optional ImageDataParameter image_data_param = 15; + optional InfogainLossParameter infogain_loss_param = 16; + optional InnerProductParameter inner_product_param = 17; + optional LRNParameter lrn_param = 18; + optional MemoryDataParameter memory_data_param = 22; + optional MVNParameter mvn_param = 34; + optional PoolingParameter pooling_param = 19; + optional PowerParameter power_param = 21; 
+ optional ReLUParameter relu_param = 30; + optional SigmoidParameter sigmoid_param = 38; + optional SoftmaxParameter softmax_param = 39; + optional SliceParameter slice_param = 31; + optional TanHParameter tanh_param = 37; + optional ThresholdParameter threshold_param = 25; + optional WindowDataParameter window_data_param = 20; + optional TransformationParameter transform_param = 36; + optional LossParameter loss_param = 42; + optional V0LayerParameter layer = 1; +} + +// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters +// in Caffe. We keep this message type around for legacy support. +message V0LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the string to specify the layer type + + // Parameters to specify layers with inner products. + optional uint32 num_output = 3; // The number of outputs for the layer + optional bool biasterm = 4 [default = true]; // whether to have bias terms + optional FillerParameter weight_filler = 5; // The filler for the weight + optional FillerParameter bias_filler = 6; // The filler for the bias + + optional uint32 pad = 7 [default = 0]; // The padding size + optional uint32 kernelsize = 8; // The kernel size + optional uint32 group = 9 [default = 1]; // The group size for group conv + optional uint32 stride = 10 [default = 1]; // The stride + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional PoolMethod pool = 11 [default = MAX]; // The pooling method + optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio + + optional uint32 local_size = 13 [default = 5]; // for local response norm + optional float alpha = 14 [default = 1.]; // for local response norm + optional float beta = 15 [default = 0.75]; // for local response norm + optional float k = 22 [default = 1.]; + + // For data layers, specify the data source + optional string source = 16; + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 17 [default = 1]; + optional string meanfile = 18; + // For data layers, specify the batch size. + optional uint32 batchsize = 19; + // For data layers, specify if we would like to randomly crop an image. + optional uint32 cropsize = 20 [default = 0]; + // For data layers, specify if we want to randomly mirror data. + optional bool mirror = 21 [default = false]; + + // The blobs containing the numeric parameters of the layer + repeated BlobProto blobs = 50; + // The ratio that is multiplied on the global learning rate. If you want to + // set the learning ratio for one blob, you need to set it for all blobs. + repeated float blobs_lr = 51; + // The weight decay that is multiplied on the global weight decay. + repeated float weight_decay = 52; + + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. 
+  optional uint32 rand_skip = 53 [default = 0];
+
+  // Fields related to detection (det_*)
+  // foreground (object) overlap threshold
+  optional float det_fg_threshold = 54 [default = 0.5];
+  // background (non-object) overlap threshold
+  optional float det_bg_threshold = 55 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float det_fg_fraction = 56 [default = 0.25];
+
+  // optional bool OBSOLETE_can_clobber = 57 [default = true];
+
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 det_context_pad = 58 [default = 0];
+
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string det_crop_mode = 59 [default = "warp"];
+
+  // For ReshapeLayer, one needs to specify the new dimensions.
+  optional int32 new_num = 60 [default = 0];
+  optional int32 new_channels = 61 [default = 0];
+  optional int32 new_height = 62 [default = 0];
+  optional int32 new_width = 63 [default = 0];
+
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  // It will also resize images if new_height or new_width are not zero.
+  optional bool shuffle_images = 64 [default = false];
+
+  // For ConcatLayer, one needs to specify the dimension for concatenation, and
+  // the other dimensions must be the same for all the bottom blobs.
+  // By default it will concatenate blobs along the channels dimension.
+  optional uint32 concat_dim = 65 [default = 1];
+
+  optional HDF5OutputParameter hdf5_output_param = 1001;
+}
+
+message PReLUParameter {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerParameter filler = 1;
+  // Whether or not slope parameters are shared across channels.
+ optional bool channel_shared = 2 [default = false]; +} diff --git a/caffe2/.clang-format b/caffe2/.clang-format new file mode 100644 index 0000000..1307bf2 --- /dev/null +++ b/caffe2/.clang-format @@ -0,0 +1,87 @@ +--- +AccessModifierOffset: -1 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] +IncludeCategories: + - Regex: '^<.*\.h(pp)?>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... 
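As a quick illustration of how the layer parameter messages added in `caffe.proto` above are consumed, the sketch below builds a `ScaleParameter` and round-trips one of the `ReshapeParameter` examples from the comments through the prototxt text format. This is a minimal sketch, assuming the module protoc generates from this `caffe.proto` is importable as `caffe.proto.caffe_pb2`; everything else uses only fields defined in the hunk above.

```
from google.protobuf import text_format

# Assumption: the protoc-generated module from caffe.proto is importable
# under this name; adjust to wherever the build actually places it.
from caffe.proto import caffe_pb2

# ScaleParameter, as defined above: a learned scale applied along the
# channel axis (axis = 1), covering a single axis, with a learned bias.
scale = caffe_pb2.ScaleParameter()
scale.axis = 1
scale.num_axes = 1
scale.bias_term = True
print(text_format.MessageToString(scale))

# Round-trip one of the ReshapeParameter examples from the comments above:
#   reshape_param { shape { dim: 0 dim: 2 dim: -1 } }
reshape = text_format.Parse("shape { dim: 0 dim: 2 dim: -1 }",
                            caffe_pb2.ReshapeParameter())
assert list(reshape.shape.dim) == [0, 2, -1]
```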
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt new file mode 100644 index 0000000..07f0164 --- /dev/null +++ b/caffe2/CMakeLists.txt @@ -0,0 +1,553 @@ +# ---[ Generate and install header and cpp files +include(../cmake/Codegen.cmake) + +# ---[ Declare source file lists + +# ---[ Shared build +add_subdirectory(utils) + +# ---[ ATen build +if(BUILD_ATEN) + set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(AT_LINK_STYLE INTERFACE) + add_subdirectory(../aten aten) + set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) + + if(BUILD_CAFFE2) + # Generate the headers wrapped by our operator + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h + COMMAND + ${PYCMD} ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py + --aten_root=${CMAKE_CURRENT_SOURCE_DIR}/../aten + --template_dir=${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten + --yaml_dir=${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + --install_dir=${CMAKE_CURRENT_BINARY_DIR}/contrib/aten + DEPENDS + ATEN_CPU_FILES_GEN_TARGET + ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py + ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/aten_op_template.h) + + add_custom_target(__aten_op_header_gen + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h) + add_library(aten_op_header_gen INTERFACE) + add_dependencies(aten_op_header_gen __aten_op_header_gen) + endif() + + # Add source, includes, and libs to lists + list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) + list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) + # ATen tests use catch instead of gtest so keep separate for now + # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) + list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) + list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS}) + list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) + + IF(USE_ROCM) + # Set the HIP Variables + set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) + ENDIF(USE_ROCM) +endif() + +# ---[ Caffe2 build +if(BUILD_CAFFE2) + # Note: the folders that are being commented out have not been properly + # addressed yet. + add_subdirectory(proto) + add_subdirectory(contrib) + add_subdirectory(core) + add_subdirectory(core/nomnigraph) + add_subdirectory(core/dispatch) + if (USE_NVRTC) + add_subdirectory(cuda_rtc) + endif() + add_subdirectory(db) + add_subdirectory(distributed) + # add_subdirectory(experiments) # note, we may remove this folder at some point + add_subdirectory(ideep) + add_subdirectory(image) + add_subdirectory(video) + add_subdirectory(mkl) + add_subdirectory(mobile) + add_subdirectory(mpi) + add_subdirectory(observers) + add_subdirectory(onnx) + add_subdirectory(operators) + add_subdirectory(operators/rnn) + add_subdirectory(opt) + add_subdirectory(perfkernels) + add_subdirectory(python) + add_subdirectory(queue) + add_subdirectory(sgd) + add_subdirectory(share) + # add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit + add_subdirectory(transforms) +endif() + +# Advanced: if we have white list specified, we will do intersections for all +# main lib srcs. 
+if (CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES) + caffe2_do_whitelist(Caffe2_HIP_SRCS CAFFE2_WHITELISTED_FILES) +endif() + +# Debug messages - if you want to get a list of source files, enable the +# following. +if (FALSE) + message(STATUS "CPU sources: ") + foreach(tmp ${Caffe2_CPU_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU sources: ") + foreach(tmp ${Caffe2_GPU_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "CPU include: ") + foreach(tmp ${Caffe2_CPU_INCLUDE}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU include: ") + foreach(tmp ${Caffe2_GPU_INCLUDE}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "CPU test sources: ") + foreach(tmp ${Caffe2_CPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "GPU test sources: ") + foreach(tmp ${Caffe2_GPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "HIP sources: ") + foreach(tmp ${Caffe2_HIP_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "HIP test sources: ") + foreach(tmp ${Caffe2_HIP_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "ATen CPU test sources: ") + foreach(tmp ${ATen_CPU_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + + message(STATUS "ATen CUDA test sources: ") + foreach(tmp ${ATen_CUDA_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() +endif() + +# ---[ List of libraries to link with +if (BUILD_CAFFE2) + add_library(caffe2_protos STATIC $ $) + add_dependencies(caffe2_protos Caffe_PROTO Caffe2_PROTO) +else() + # Do not include caffe2 or caffe protos, but rather have it only be + # a library to attach local protobuf. + add_library(caffe2_protos STATIC utils/dummy.cpp) +endif() +# If we are going to link protobuf locally inside caffe2 libraries, what we will do is +# to create a helper static library that always contains libprotobuf source files, and +# link the caffe2 related dependent libraries to it. +target_include_directories(caffe2_protos INTERFACE $) +# Reason for this public dependency is as follows: +# (1) Strictly speaking, we should not expose any Protobuf related functions. We should +# only use function interfaces wrapped with our own public API, and link protobuf +# locally. +# (2) However, currently across the Caffe2 codebase, we have extensive use of protobuf +# functionalities. For example, not only libcaffe2.so uses it, but also other +# binaries such as python extensions etc. As a result, we will have to have a +# transitive dependency to libprotobuf. +# +# Good thing is that, if we specify CAFFE2_LINK_LOCAL_PROTOBUF, then we do not need to +# separately deploy protobuf binaries - libcaffe2.so will contain all functionalities +# one needs. One can verify this via ldd. +# +# TODO item in the future includes: +# (1) Enable using lite protobuf +# (2) Properly define public API that do not directly depend on protobuf itself. +# (3) Expose the libprotobuf.a file for dependent libraries to link to. +# +# What it means for users/developers? +# (1) Users: nothing affecting the users, other than the fact that CAFFE2_LINK_LOCAL_PROTOBUF +# avoids the need to deploy protobuf. +# (2) Developers: if one simply uses core caffe2 functionality without using protobuf, +# nothing changes. 
If one has a dependent library that uses protobuf, then one needs to +# have the right protobuf version as well as linking to libprotobuf.a. +target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf) + +# Compile exposed libraries. +list(APPEND Caffe2_CPU_SRCs $) +add_library(caffe2 ${Caffe2_CPU_SRCS}) +caffe2_interface_library(caffe2_protos caffe2_protos_whole) +target_link_libraries(caffe2 PRIVATE caffe2_protos_whole) +if (${CAFFE2_LINK_LOCAL_PROTOBUF}) + target_link_libraries(caffe2 INTERFACE protobuf::libprotobuf) +else() + target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) +endif() +target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) +target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) +target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) +target_include_directories(caffe2 INTERFACE $) +target_include_directories(caffe2 PRIVATE ${Caffe2_CPU_INCLUDE}) +target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") +# Set standard properties on the target +aten_set_target_props(caffe2) +target_compile_options(caffe2 INTERFACE "-std=c++11") +target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") +# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) +target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") +install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) +caffe2_interface_library(caffe2 caffe2_library) +list(APPEND Caffe2_MAIN_LIBS caffe2_library) + +# ---[ CUDA library. +if(USE_CUDA) + # A hack to deal with cuda library dependencies and modern CMake: the + # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, + # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This + # hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with + # it. We will then manually add the cudart library as interface libs. + set(__tmp ${CUDA_LIBRARIES}) + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS}) + set(CUDA_LIBRARIES ${__tmp}) + target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart) + + target_include_directories( + caffe2_gpu INTERFACE $) + target_include_directories( + caffe2_gpu PRIVATE ${Caffe2_GPU_INCLUDE}) + target_link_libraries( + caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) + + # These public dependencies must go after the previous dependencies, as the + # order of the libraries in the linker call matters here when statically + # linking; libculibos and cublas must be last. + target_link_libraries( + caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) + + # Set standard properties on the target + aten_set_target_props(caffe2_gpu) + + install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib) + caffe2_interface_library(caffe2_gpu caffe2_gpu_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library) +endif() + +# ---[ Caffe2 HIP sources. +if(USE_ROCM) + # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. + IF(BUILD_ATEN) + HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES}) + ENDIF() + IF(BUILD_CAFFE2) + set_source_files_properties(${Caffe2_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + ENDIF() + hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) + + # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added. 
+ set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS}) + target_link_libraries(caffe2_hip PUBLIC caffe2) + target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + + # Since PyTorch files contain HIP headers, this is also needed to capture the includes. + target_include_directories(caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES}) + target_include_directories(caffe2_hip INTERFACE $) + + IF(BUILD_ATEN) + aten_set_target_props(caffe2_hip) + ENDIF() + + # When a library has object files that contain device code, it needs to use hipcc/hcc to link. + set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + + caffe2_interface_library(caffe2_hip caffe2_hip_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) + install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) +endif() + +# ---[ Check if warnings should be errors. +if ($ENV{WERROR}) + target_compile_options(caffe2 PRIVATE -Werror) + if(USE_CUDA) + target_compile_options(caffe2_gpu PRIVATE -Werror) + endif() +endif() + +# ---[ Test binaries. +if(BUILD_CAFFE2) + if (BUILD_TEST) + set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}) + if (USE_CUDA) + list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}) + endif() + + foreach(test_src ${Caffe2_ALL_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) + target_compile_features(${test_name} PRIVATE cxx_range_for) + endif() + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() + + if(USE_ROCM) + foreach(test_src ${Caffe2_HIP_TEST_SRCS}) + set_source_files_properties(${test_src} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + get_filename_component(test_name ${test_src} NAME_WE) + hip_add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) + target_compile_features(${test_name} PRIVATE cxx_range_for) + endif() + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() + endif() + + endif() +endif() + +set(__aten_test_dir "test") +if(BUILD_CAFFE2) + # Aten tests should only run when Caffe2 is not built + set(__aten_test_dir "test/aten") +endif() +# Todo - Set up ATen tests for ROCm in an upcoming PR +if(BUILD_ATEN AND NOT USE_ROCM) + foreach(test_src ${ATen_CPU_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) + endforeach() + + if(USE_CUDA OR USE_ROCM) + foreach(test_src ${ATen_CUDA_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + torch_cuda_based_add_executable(${test_name} "${test_src}") + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION 
${__aten_test_dir}) + endforeach() + endif() +endif() + +if(BUILD_CAFFE2) + if (BUILD_PYTHON) + # Python site-packages + # Get canonical directory for python site packages (relative to install + # location). It varies from system to system. + pycmd(PYTHON_SITE_PACKAGES " + from distutils import sysconfig + print(sysconfig.get_python_lib(prefix='')) + ") + SET(PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES} PARENT_SCOPE) # for Summary + # ---[ Options. + SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)") + message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path") + # Python extension suffix + # Try to get from python through sysconfig.get_env_var('EXT_SUFFIX') first, + # fallback to ".pyd" if windows and ".so" for all others. + pycmd(PY_EXT_SUFFIX " + from distutils import sysconfig + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + print(ext_suffix if ext_suffix else '') + ") + if("${PY_EXT_SUFFIX}" STREQUAL "") + if (MSVC) + set(PY_EXT_SUFFIX ".pyd") + else() + set(PY_EXT_SUFFIX ".so") + endif() + endif() + + # Allow different install locations for libcaffe2 + # For setuptools installs (that all build Python), install libcaffe2 into + # site-packages, alongside the torch libraries. The pybind11 library needs + # an rpath to the torch library folder + # For cmake installs, including c++ only installs, install libcaffe2 into + # CMAKE_INSTALL_PREFIX/lib . The pybind11 library can have a hardcoded + # rpath + if(APPLE) + set(_rpath_portable_origin "@loader_path") + else() + set(_rpath_portable_origin $ORIGIN) + endif(APPLE) + set(caffe2_pybind11_rpath "${_rpath_portable_origin}") + if(${BUILDING_WITH_TORCH_LIBS}) + # site-packages/caffe2/python/caffe2_pybind11_state + # site-packages/torch/lib + set(caffe2_pybind11_rpath "${_rpath_portable_origin}/../../torch/lib") + endif(${BUILDING_WITH_TORCH_LIBS}) + + + # ---[ Python. 
+ add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "" DEBUG_POSTFIX "") + set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state caffe2_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install caffe2_pybind11_state(_gpu|hip) in site-packages/caffe2/python, + # so it needs an rpath to find libcaffe2 + set_target_properties( + caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + + if(USE_CUDA) + add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "" DEBUG_POSTFIX "") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state_gpu ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install with same rpath as non-gpu caffe2_pybind11_state + set_target_properties( + caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + endif() + + if(USE_ROCM) + add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + if (APPLE) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries( + caffe2_pybind11_state_hip caffe2_library caffe2_hip_library) + if (WIN32) + target_link_libraries(caffe2_pybind11_state_hip ${PYTHON_LIBRARIES}) + endif(WIN32) + + # Install with same rpath as non-hip caffe2_pybind11_state + set_target_properties( + caffe2_pybind11_state_hip PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/caffe2/python) + install(TARGETS caffe2_pybind11_state_hip DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python") + set_target_properties(caffe2_pybind11_state_hip PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}") + endif() + + if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio") + # If we are building under windows, we will copy the file from + # build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd + # to its parent folder so that we can do in-build execution. 
+ add_custom_target(windows_python_copy_lib ALL) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + if (USE_CUDA) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + endif() + if (USE_ROCM) + add_dependencies(windows_python_copy_lib caffe2_pybind11_state_hip) + add_custom_command( + TARGET windows_python_copy_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_BINARY_DIR}/caffe2/python) + endif() + endif() + + # Finally, Copy all python files to build directory + # Generate and create all needed __init__.py files, if they aren't already + # present in the current source tree. + message(STATUS "Automatically generating missing __init__.py files.") + caffe_autogen_init_py_files() + + # Create a custom target that copies all python files. + file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}/caffe2/*.py") + add_custom_target(python_copy_files ALL) + if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja") + # ninja fails when the command line is too long so we split + # the target into several. This would be beneficial for VS also + # since it build targets in parallel but not custom commands + foreach(python_src ${PYTHON_SRCS}) + get_filename_component(dir ${python_src} DIRECTORY) + string(SHA1 name_hash "${python_src}") + # get_filename_component(name_we ${python_src} NAME_WE) + add_custom_target(python_copy_files_${name_hash} + COMMAND ${CMAKE_COMMAND} -E copy + ${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir}) + add_dependencies(python_copy_files python_copy_files_${name_hash}) + endforeach() + else() + foreach(python_src ${PYTHON_SRCS}) + get_filename_component(dir ${python_src} DIRECTORY) + add_custom_command( + TARGET python_copy_files PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir}) + endforeach() + endif() + + # Install commands + # Pick up static python files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + # Caffe proto files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + # Caffe2 proto files + install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH} + FILES_MATCHING PATTERN "*.py") + endif() +endif() + +# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope. +set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE) diff --git a/caffe2/README.md b/caffe2/README.md new file mode 100644 index 0000000..a1166b8 --- /dev/null +++ b/caffe2/README.md @@ -0,0 +1,21 @@ +# Caffe2 + +[![Jenkins Build Status](https://ci.pytorch.org/jenkins/job/caffe2-master/lastCompletedBuild/badge/icon)](https://ci.pytorch.org/jenkins/job/caffe2-master) + +Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind. + +## Questions and Feedback + +Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features. 
+
+### Further Resources on [Caffe2.ai](http://caffe2.ai)
+
+* [Installation](http://caffe2.ai/docs/getting-started.html)
+* [Learn More](http://caffe2.ai/docs/learn-more.html)
+* [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html)
+* [Datasets](http://caffe2.ai/docs/datasets.html)
+* [Model Zoo](http://caffe2.ai/docs/zoo.html)
+* [Tutorials](http://caffe2.ai/docs/tutorials.html)
+* [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html)
+* [C++ API](http://caffe2.ai/doxygen-c/html/classes.html)
+* [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html)
diff --git a/caffe2/VERSION_NUMBER b/caffe2/VERSION_NUMBER
new file mode 100644
index 0000000..53a48a1
--- /dev/null
+++ b/caffe2/VERSION_NUMBER
@@ -0,0 +1 @@
+0.8.2
\ No newline at end of file
diff --git a/caffe2/__init__.py b/caffe2/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/caffe2/contrib/CMakeLists.txt b/caffe2/contrib/CMakeLists.txt
new file mode 100644
index 0000000..be8c0bd
--- /dev/null
+++ b/caffe2/contrib/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_subdirectory(aten)
+add_subdirectory(gloo)
+add_subdirectory(nccl)
+add_subdirectory(opencl)
+add_subdirectory(prof)
+add_subdirectory(shm_mutex)
+add_subdirectory(script)
+if (USE_TENSORRT)
+add_subdirectory(tensorrt)
+endif()
+
+# Pass the src lists back to the parent
+
+# CPU source, include, deps, test sources, binary sources
+set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
+set(Caffe2_CPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
+set(Caffe2_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS} PARENT_SCOPE)
+set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
+set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
+
+# GPU source, include, deps, test sources, binary sources
+set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
+set(Caffe2_GPU_INCLUDE ${Caffe2_GPU_INCLUDE} PARENT_SCOPE)
+set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
+set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
+set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
diff --git a/caffe2/contrib/__init__.py b/caffe2/contrib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt
new file mode 100644
index 0000000..5bc2341
--- /dev/null
+++ b/caffe2/contrib/aten/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(BUILD_ATEN)
+  # Add source generated by Codegen.cmake and pass to parent
+  list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc)
+  list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc)
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
+  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
+endif()
diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md
new file mode 100644
index 0000000..d3046aa
--- /dev/null
+++ b/caffe2/contrib/aten/README.md
@@ -0,0 +1,80 @@
+# An ATen operator for Caffe2
+
+[ATen](https://github.com/zdevito/aten) is a simple tensor library that exposes the Tensor operations in Torch
+and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API
+that makes these functions available in Caffe2 as an operator. It also makes it accessible using the
+ToffeeIR.
+
+
+### Example Usage in Caffe2
+
+First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
+[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
+
+We will call the `pow` operator:
+
+```
+static inline Tensor pow(const Tensor & self, Scalar exponent);
+```
+
+Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
+and there is always a string attribute `operator` that defines which ATen function to call:
+
+```
+import numpy as np
+from caffe2.python import core, workspace
+
+
+# create the Caffe2 Op:
+op = core.CreateOperator(
+    "ATen",
+    ["MyInput"],
+    ["MyOutput"],
+    operator="pow", exponent=2.0)
+```
+
+Each `Tensor` input becomes a Caffe2 input blob, and each output becomes a Caffe2 output blob.
+Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
+In the case of `Scalar`, the attribute can be either an integer or a floating point number.
+
+The op can now be run like any other Caffe2 operator:
+
+```
+workspace.FeedBlob("MyInput", np.random.randn(2, 3).astype(np.float32))
+workspace.RunOperatorOnce(op)
+print(workspace.FetchBlob("MyOutput"))
+```
+
+For methods, the first input is always the `this` Tensor in C++.
+To call methods of ATen's `Type` objects, you provide an additional string attribute
+that determines the type:
+
+```
+# create a 2x4 tensor filled with floating point ones
+op = core.CreateOperator(
+    "ATen",
+    [],
+    ["MyOutput"],
+    operator="ones", type="float", size={2, 4})
+```
+
+Generally, ATen operators are polymorphic across input types and work on both the CPU and CUDA.
+
+### Example Usage via PyTorch Symbolic
+
+The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
+to ONNX.
In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API: + +``` +class Add(torch.autograd.Function): + + @staticmethod + def symbolic(g, a, b): + return g.op("ATen", a, b, operator_s = "add") + + @staticmethod + def forward(ctx, a, b): + return a + b +``` diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc new file mode 100644 index 0000000..bc93f48 --- /dev/null +++ b/caffe2/contrib/aten/aten_op.cc @@ -0,0 +1,26 @@ +#include "caffe2/contrib/aten/aten_op.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(ATen, ATenOp); +template<> +at::Backend ATenOp::backend() const { + return at::kCPU; +} + +OPERATOR_SCHEMA(ATen); +CAFFE_KNOWN_TYPE(at::Half); + +namespace math { +template <> +void Set( + const size_t /*N*/, + const at::Half h, + at::Half* v, + CPUContext* c) { + Set(0, h.x, (uint16_t*) v, c); +} +} + +} diff --git a/caffe2/contrib/aten/aten_op.h b/caffe2/contrib/aten/aten_op.h new file mode 100644 index 0000000..7161e4a --- /dev/null +++ b/caffe2/contrib/aten/aten_op.h @@ -0,0 +1 @@ +#include "caffe2/caffe2/contrib/aten/gen_aten_op.h" diff --git a/caffe2/contrib/aten/aten_op_cuda.cc b/caffe2/contrib/aten/aten_op_cuda.cc new file mode 100644 index 0000000..d416e70 --- /dev/null +++ b/caffe2/contrib/aten/aten_op_cuda.cc @@ -0,0 +1,23 @@ +#include "caffe2/contrib/aten/aten_op.h" +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR(ATen, ATenOp); +template<> +at::Backend ATenOp::backend() const { + return at::kCUDA; +} + +namespace math { +template <> +void Set( + const size_t /*N*/, + const at::Half h, + at::Half* v, + CUDAContext* c) { + Set(0, h.x, (uint16_t*) v, c); +} +} + +} diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h new file mode 100644 index 0000000..feccafd --- /dev/null +++ b/caffe2/contrib/aten/aten_op_template.h @@ -0,0 +1,226 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// a map from descriptor strings (see [DESCRIPTORS]) +// to the key in the switch statement that implements them +static std::unordered_map op_to_key = { + ${mappings} +}; + +namespace caffe2 { + +using at::Half; // for AT_FORALL_SCALAR_TYPES + +template +class ATenOp : public Operator { + public: + ATenOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) { + VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n"; + switch(findImplementation(operator_def)) { + ${implementations} + default: + CAFFE_THROW("Unexpected key value for aten operator"); + } + } + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override { + return run_op(); + } +private: + // actual operator implementation is initialized in ctor. 
+ std::function run_op; + at::Backend backend() const; + + TypeMeta typeMetaFor(const at::Tensor & t) { + return typeMetaFor(t.type().scalarType()); + } + TypeMeta typeMetaFor(at::ScalarType st) { + #define DEFINE_CASE(ctype,aten_name,_) \ + case at::k##aten_name: \ + return TypeMeta::Make(); + switch(st) { + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + default: + CAFFE_THROW("Unknown ATen Type"); + } + #undef DEFINE_CASE + } + + at::Type & typeFor(const Tensor & ten) { + return at::getType(backend(), atScalarTypeFor(ten.meta())); + } + at::Tensor tensorWrapping(const Tensor& ten_) { + auto& ten = const_cast&>(ten_); + return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims()); + } + + at::Tensor peek(size_t i, size_t N) { + auto real_idx = InputSize() - N + i; + return tensorWrapping(Input(real_idx)); + } + + std::vector peekSlice(size_t i, size_t len, size_t N) { + std::vector results; + for (size_t ii = i; ii < i + len; ++ii) { + results.push_back(peek(ii, N)); + } + return results; + } + + at::ScalarType atScalarTypeFor(const TypeMeta & meta) { + #define DEFINE_IF(ctype,aten_name,_) \ + if(meta.Match()) { \ + return at::k##aten_name; \ + } + AT_FORALL_SCALAR_TYPES(DEFINE_IF) + #undef DEFINE_IF + // Special case for bool, since the type in ATen is actually Byte + if (meta.Match()) { + return at::kByte; + } + CAFFE_THROW("Unknown type meta"); // TODO: improve error message... + } + void assignTo(Tensor * dst, const at::Tensor & src_) { + at::Tensor src = src_.contiguous(); + auto at_sizes = src.sizes(); + std::vector dims(at_sizes.begin(),at_sizes.end()); + dst->Resize(dims); + dst->ShareExternalPointer( + src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { + // return a closure that holds a handle to t until it is called + // to keep the aten memory alive + return src.reset(); + }); + } + void assignListStartingAt( + size_t offset, + const std::vector& tensors) { + for (size_t i = 0; i < tensors.size(); i++) { + assignTo(Output(offset + i), tensors[i]); + } + } + + // the AT_FORALL_SCALAR_TYPES macro just gives a 'i' or 'd' argument + // for each type to specify if it is stored as a integer or a double. 
+ // We need this workaround here to extract the value in the scalar losslessly + // because in some cases like 'sum' Torch promotes float to double + // and will complain if we downcast it with toFloat, causing it + // to lose precision + double extract_d(const at::Scalar & s) { + return s.toDouble(); + } + int64_t extract_i(const at::Scalar & s) { + return s.toLong(); + } + + void assignTo(Tensor * dst, at::Type & inferred_type, at::Scalar scalar) { + switch(inferred_type.scalarType()) { + #define DEFINE_CASE(ctype,aten_name,native) \ + case at::k##aten_name: { \ + auto value = extract_##native(scalar); \ + assignToValue(dst, at::convert(value)); \ + } break; + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + #undef DEFINE_CASE + default: + CAFFE_THROW("Unknown ATen Type"); + } + } + template + void assignToValue(Tensor * dst, T v) { + dst->Resize(std::vector()); + math::Set(1, v, dst->template mutable_data(), &context_); + } + int findImplementation(const OperatorDef& operator_def) { + CAFFE_ENFORCE(HasArgument("operator")); + std::string op = OperatorBase::GetSingleArgument("operator", ""); + // construct descriptor string ([DESCRIPTORS]) given the attributes + // and inputs of this operator_def, and look up the implementation key + // for this variant + std::stringstream descriptor; + descriptor << op; + std::vector attrs; + for(size_t i = 0; i < operator_def.arg_size(); i++) { + auto & attr = operator_def.arg(i); + if(attr.name() == "operator" || attr.name() == "type" ) + continue; + attrs.push_back(attr.name()); + } + std::sort(attrs.begin(), attrs.end()); + for(auto & a : attrs) + descriptor << "-" << a; + + std::string descriptor_sized = + descriptor.str() + "-" + caffe2::to_string(InputSize()); + std::string descriptor_var_args = descriptor.str() + "-*"; + if (op_to_key.count(descriptor_sized) > 0) { + return op_to_key[descriptor_sized]; + } + if (op_to_key.count(descriptor_var_args) > 0) { + return op_to_key[descriptor_var_args]; + } + std::stringstream ss; + ss << "Attempting to run unknown ATen operator configuration: " + << descriptor_sized; + CAFFE_THROW(ss.str()); + } + at::Scalar readScalarAttribute(const std::string & name) { + if(OperatorBase::HasSingleArgumentOfType(name)) { + return OperatorBase::GetSingleArgument(name, 0); + } else { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return OperatorBase::GetSingleArgument(name, 0); + } + } + template + T readAttribute(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return OperatorBase::GetSingleArgument(name, 0); + } + std::vector readIntList(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasArgument(name)); + return OperatorBase::GetRepeatedArgument(name, {}); + } + template + std::array readBoolMask(const std::string& name) { + CAFFE_ENFORCE(OperatorBase::HasArgument(name)); + std::vector ints = + OperatorBase::GetRepeatedArgument(name, {}); + std::array result; + for (size_t i = 0; i < N; ++i) { + result[i] = ints.at(i); + } + return result; + } + at::ScalarType stringToScalarType(const std::string & name) { + #define DEFINE_IF(type,aten) \ + if(#type == name) \ + return at::k##aten; + DEFINE_IF(float16, Half) + DEFINE_IF(float, Float) + DEFINE_IF(double, Double) + DEFINE_IF(uint8, Byte) + DEFINE_IF(int8, Char) + DEFINE_IF(int16, Short) + DEFINE_IF(int32, Int) + DEFINE_IF(int64, Long) + CAFFE_THROW("unsupported type annotation: ", name); + } + at::Type & stringToType(const std::string & name) { + return at::getType(backend(), stringToScalarType(name)); + } + 
at::Type * readTypeAttribute(const std::string & name) { + CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); + return &stringToType(OperatorBase::GetSingleArgument(name, "")); + } +}; + +} diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py new file mode 100644 index 0000000..52d5c38 --- /dev/null +++ b/caffe2/contrib/aten/aten_test.py @@ -0,0 +1,106 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, dyndep +from hypothesis import given + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op') + + +class TestATen(hu.HypothesisTestCase): + + @given(inputs=hu.tensors(n=2), **hu.gcs) + def test_add(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["X", "Y"], + ["Z"], + operator="add") + + def ref(X, Y): + return [X + Y] + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(inputs=hu.tensors(n=1), **hu.gcs) + def test_pow(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["S"], + ["Z"], + operator="pow", exponent=2.0) + + def ref(X): + return [np.square(X)] + + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(x=st.integers(min_value=2, max_value=8), **hu.gcs) + def test_sort(self, x, gc, dc): + inputs = [np.random.permutation(x)] + op = core.CreateOperator( + "ATen", + ["S"], + ["Z", "I"], + operator="sort") + + def ref(X): + return [np.sort(X), np.argsort(X)] + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(inputs=hu.tensors(n=1), **hu.gcs) + def test_sum(self, inputs, gc, dc): + op = core.CreateOperator( + "ATen", + ["S"], + ["Z"], + operator="sum") + + def ref(X): + return [np.sum(X)] + + self.assertReferenceChecks(gc, op, inputs, ref) + + @given(**hu.gcs) + def test_ones(self, gc, dc): + op = core.CreateOperator( + "ATen", + [], + ["Z"], + operator="ones", type="float", size={2, 4}) + + def ref(): + return [np.ones([2, 4])] + + self.assertReferenceChecks(gc, op, [], ref) + + @given(**hu.gcs) + def test_index_put(self, gc, dc): + op = core.CreateOperator( + "ATen", + ['self', 'indices', 'values'], + ["Z"], + operator="index_put") + + def ref(self, indices, values): + self[indices] = values + return (self,) + + + tensor = np.random.randn(3, 3).astype(np.float32) + mask = np.array([[True, True, True], [True, False, False], [True, True, False]]) + values = np.random.randn(6).astype(np.float32) + + self.assertReferenceChecks(gc, op, [tensor, mask, values], ref) + + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md new file mode 100644 index 0000000..04ddaef --- /dev/null +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -0,0 +1,157 @@ +# Using ONNX and ATen to export models from PyTorch to Caffe2 + +When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up +hitting operators that are not yet part of the ONNX specification. These may be +operators that haven't been standardized yet, or custom `torch.autograd.Function` types that +are specific to a network. + +To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. 
+[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten)
+that can run these tensor functions in a Caffe2 network after importing them through ONNX.
+
+This guide explains how to configure Caffe2 and modify your PyTorch program to use
+this functionality.
+
+### Enable ATen in Caffe2
+
+The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
+when you configure Caffe2 using cmake:
+
+```
+git clone https://github.com/caffe2/caffe2/
+mkdir caffe2/build
+cd caffe2/build
+cmake -DUSE_ATEN=ON ..
+make install
+```
+
+### Describing How to Export a PyTorch Autograd Function using ATen
+
+To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
+in the forward pass of a network. For each function in the trace, it calls that function's
+`symbolic` method, which describes how to construct the part of the ONNX graph
+that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).
+
+When equivalent ONNX operators do not exist, you can instead call any ATen function.
+As an example, let's assume we have an autograd function which computes `x*x+y`:
+
+```
+class MyFunction(Function):
+    @staticmethod
+    def forward(ctx, x, y):
+        return x*x + y
+```
+
+We can add a `symbolic` method to it like so:
+
+```
+class MyFunction(Function):
+    @staticmethod
+    def forward(ctx, x, y):
+        return x*x + y
+
+    @staticmethod
+    def symbolic(graph, x, y):
+        x2 = graph.at("mul", x, x)
+        r = graph.at("add", x2, y)
+        # x, y, x2, and r are 'Node' objects.
+        # print(r) or print(graph) will print out a textual representation for debugging.
+        # This representation will be converted to ONNX protobufs on export.
+        return r
+```
+
+The function `graph.at` adds a new ATen op to the computation graph.
+You can call any ATen function using this facility. To do so,
+first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
+[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
+
+As an example, we might want to call the `pow` operator:
+
+```
+static inline Tensor pow(const Tensor & self, Scalar exponent);
+```
+
+We can translate this into the equivalent `graph.at` function:
+
+```
+def symbolic(graph, x):
+    return graph.at("pow", x, exponent_f=2.0)  # compute x**2
+```
+
+Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
+like `exponent` becomes a keyword argument that specifies an ONNX attribute.
+Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).
+
+For methods, the first input is always the `this` Tensor in C++.
+To call methods of ATen's `Type` objects, you provide an additional string attribute
+that determines the type. For instance, `ones` creates a new constant tensor of all ones:
+```
+class Type {
+  ...
+  virtual Tensor ones(IntList size) const;
+  ...
+};
+```
+
+From PyTorch it can be created by adding the type as an additional attribute:
+
+```
+def symbolic(graph, x):
+    return graph.at("ones", type_s="float", size_i=[2,4])
+```
+
+Generally, ATen operators are polymorphic across input types and work on both the CPU and CUDA.
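+
+As a short sketch combining the pieces above (the `SquarePlusOnes` function and the fixed
+`3x4` shape are hypothetical, chosen only so the calls already shown can be reused), a
+`symbolic` method can mix tensor arguments with typed attributes:
+
+```
+class SquarePlusOnes(Function):
+    @staticmethod
+    def forward(ctx, x):
+        # assumes x is a 3x4 tensor
+        return x*x + 1
+
+    @staticmethod
+    def symbolic(graph, x):
+        # x**2 via the ATen "pow" function; the Scalar exponent becomes a float attribute (_f)
+        x2 = graph.at("pow", x, exponent_f=2.0)
+        # a constant tensor built from a Type method, selected with a string attribute (_s)
+        # and an integer list attribute (_i)
+        ones = graph.at("ones", type_s="float", size_i=[3, 4])
+        return graph.at("add", x2, ones)
+```
+
+On export, each `graph.at` call above becomes its own `ATen` node in the ONNX graph.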
+ +## Putting it together + +With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`: + +``` +class MyModule(nn.Module): + def forward(self, x, y): + # you can combine your ATen ops with standard onnx ones + x = nn.ReLU()(x) + return MyFunction.apply(x, y) + +torch.onnx.export(MyModule(), + (Variable(torch.ones(3,4)), Variable(torch.ones(3,4))), + "output.onnx", + verbose=True) +``` + +This exports the following graph, which contains calls the `ATen` operator: + +``` +graph(%1 : Float(3, 4) + %2 : Float(3, 4)) { + %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1]; + %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0]; + %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0]; + return (%5); +} +``` + +The graph can then be imported using ONNX and run with Caffe2: + +``` +import onnx +import caffe2.python.onnx.backend +import numpy as np + +graph = onnx.load("output.onnx") + +a = np.random.randn(3, 2).astype(np.float32) +b = np.random.randn(3, 2).astype(np.float32) + +prepared_backend = caffe2.python.onnx.backend.prepare(graph) +W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b} +c2_out = prepared_backend.run(W)[0] + +x = np.maximum(a, 0) +r = x*x + b +np.testing.assert_array_almost_equal(r, c2_out) +``` + +### Code + +For the full source code for this tutorial, see [sample.py](sample.py). diff --git a/caffe2/contrib/aten/docs/sample.py b/caffe2/contrib/aten/docs/sample.py new file mode 100644 index 0000000..71e2005 --- /dev/null +++ b/caffe2/contrib/aten/docs/sample.py @@ -0,0 +1,54 @@ +import numpy as np + +from torch import nn +from torch.autograd import Variable, Function +import torch.onnx + +import onnx +import caffe2.python.onnx.backend + +class MyFunction(Function): + @staticmethod + def forward(ctx, x, y): + return x*x + y + @staticmethod + def symbolic(graph, x, y): + x2 = graph.at("mul", x, x) + r = graph.at("add", x2, y) + # x, y, x2, and r are 'Node' objects + # print(r) or print(graph) will print out a textual representation for debugging. + # this representation will be converted to ONNX protobufs on export. + return r + +class MyModule(nn.Module): + def forward(self, x, y): + # you can combine your ATen ops with standard onnx ones + x = nn.ReLU()(x) + return MyFunction.apply(x, y) + +torch.onnx.export(MyModule(), + (Variable(torch.ones(3,4)), Variable(torch.ones(3,4))), + "output.onnx", + verbose=True) + +# prints the graph for debugging: +# graph(%1 : Float(3, 4) +# %2 : Float(3, 4)) { +# %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1]; +# %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0]; +# %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0]; +# return (%5); +# } + +graph = onnx.load("output.onnx") + +a = np.random.randn(3, 4).astype(np.float32) +b = np.random.randn(3, 4).astype(np.float32) + +prepared_backend = caffe2.python.onnx.backend.prepare(graph) +W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b} +c2_out = prepared_backend.run(W)[0] + +x = np.maximum(a, 0) +r = x*x + b +np.testing.assert_array_almost_equal(r, c2_out) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py new file mode 100755 index 0000000..18a3db4 --- /dev/null +++ b/caffe2/contrib/aten/gen_op.py @@ -0,0 +1,304 @@ +#!/bin/env python + +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +import sys +import yaml +import argparse +import os +from copy import deepcopy + +parser = argparse.ArgumentParser() +parser.add_argument("--template_dir", default=".", help="where template.h is") +parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen", + help="where ATen yaml files are") +parser.add_argument("--output_prefix", default="", help="") +parser.add_argument( + "--install_dir", default=".", help="where to put generated file") +parser.add_argument("--aten_root", default="", help="root directory of aten") +args, _ = parser.parse_known_args() + +if args.aten_root: + if not os.path.exists(args.aten_root): + raise ValueError('aten_root ({}) does not exist'.format( + args.aten_root)) + sys.path.append(os.path.join(args.aten_root, 'src', 'ATen')) + from code_template import CodeTemplate as CT +else: + from src.ATen.code_template import CodeTemplate as CT + +OP_TEMPLATE = CT.from_file( + os.path.join(args.template_dir, 'aten_op_template.h')) + + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +def write(filename, s): + with open(filename, "w") as f: + f.write(s) + + +def read(filename): + with open(filename, "r") as f: + return f.read() + + +def value_has_tensors(v): + # Sparse shouldn't appear in public API, seems to be temporary bug + return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type'] + + +def value_is_tensor_type(v): + return value_has_tensors(v) and v['dynamic_type'] != 'TensorList' + + +# for each aten type, how do we handle a return value of that type? +RETURN_MAP = { + 'Tensor': 'assignTo(Output(${offset}),${output});', + 'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});', + 'bool': 'assignToValue(Output(${offset}),${output});', + 'int64_t': 'assignToValue(Output(${offset}),${output});', + 'std::vector': 'assignListStartingAt(${offset}, ${output});', +} + +# for each non-Tensor aten argument, how to we read it from caffe2's +# attribute list. Most of these call runtime functions defined in the +# template class. +ARGUMENT_MAP = { + 'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");', + 'bool': 'bool ${arg} = readAttribute("${arg}");', + 'int': 'int ${arg} = readAttribute("${arg}");', + 'double': 'double ${arg} = readAttribute("${arg}");', + 'int64_t': 'int64_t ${arg} = readAttribute("${arg}");', + 'IntList': 'auto ${arg} = readIntList("${arg}");', + 'std::array': 'auto ${arg} = readBoolMask<2>("${arg}");', + 'std::array': 'auto ${arg} = readBoolMask<3>("${arg}");', +} + + +def expand(o): + num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments']) + results = [o] + for i in range(0, num_defaults): + # last num_default values should be default + assert('default' in o['arguments'][-(i + 1)]) + v = deepcopy(o) + v['arguments'] = v['arguments'][:-(i + 1)] + results.append(v) + return results + + +# filter the list of declarations removing things we cannot support +def supports(o, factory_methods): + # Ignore all families (!) 
of functions that have TensorOptions (i.e. tensor factory methods). + if o['name'] in factory_methods: + if factory_methods[o['name']] == 0: + print("Skipping {} because it is a factory method".format(o['name'])) + factory_methods[o['name']] += 1 + return False + + # skip all in-place operators for now since aten cannot Resize + # caffe2 memory inside an operator + if o['inplace']: + return False + + # _out variants also work in-place on arguments taken as destinations + # we also cannot handle these because aten cannot resize caffe2 Tensors + if "_out" in o['name']: + return False + + # skip return types we cannot handle + for ret in o['returns']: + if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP: + print("Skipping {} Because of Ret: {} ({})".format( + o['name'], ret['type'], ret['dynamic_type'])) + return False + + # skip arguments we cannot handle + for arg in o['arguments']: + if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP: + print("Skipping {} Because of Arg: {} ({}) ".format( + o['name'], arg['type'], arg['dynamic_type'])) + return False + return True + + +# template for each potential operator. +# each operator has an integer 'key' associated with it, and +# a lambda that defines the operator +# non-tensor attributes are created in ${initialization} +# and then saved as arguments to the lambda +# Inputs/Outputs are read inside the lambda +OPTION_TEMPLATE = CT("""\ +case ${key}: { // ${name} + ${initialization} + run_op = [=] { + ${statements} + auto the_result = ${invocation}; + ${assignments} + return true; + }; +} break; +""") + + +def get_output(o, i): + if len(o['returns']) == 1: + return 'the_result' + else: + return 'std::get<{}>(the_result)'.format(i) + + +def attribute_names(o): + return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)]) + + +def required_attribute_names(o): + return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a]) + + +def self_as_first_argument(arguments): + return ([a for a in arguments if a['name'] == 'self'] + + [a for a in arguments if a['name'] != 'self']) + + +def get_num_inputs(o): + args = 0 + for a in o['arguments']: + if a['type'] == 'TensorList': + return '*' + elif value_has_tensors(a): + args += 1 + return str(args) + + +def find_factory_methods(decls): + factory_methods = {} + for o in decls: + if any(arg['dynamic_type'] == 'TensorOptions' for arg in o['arguments']): + factory_methods[o['name']] = 0 + return factory_methods + + +if __name__ == '__main__': + decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader) + factory_methods = find_factory_methods(decls) + filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)] + top_env = { + 'mappings': [], + 'implementations': [], + } + seen = set() + key = 0 + for o in filtered: + # [DESCRIPTORS] + # each option is associated with a descriptor string that is used + # to figure out which version of an op is being used: + # The format is: + # opname-num_inputs-attribute_1-attribute2 + # Example: + # lerp-2-weight + # the operator lerp takes 2 arguments and has the attribute weight + attr_names = attribute_names(o) + num_inputs = get_num_inputs(o) + descriptor = '-'.join([o['name']] + attr_names + [num_inputs]) + if descriptor in seen: + continue + seen.add(descriptor) + + # map from descriptor string to the integer key in the switch statements + # that initializes the operators + top_env['mappings'].append('{{ "{}", {} 
}},'.format(descriptor, key)) + env = { + 'name': o['name'], + 'statements': [], + 'arguments': [], + 'assignments': [], + 'initialization': [], + 'key': str(key), + } + defined_inferred_type = False + + if 'Tensor' in o['method_of']: + # make sure 'self' is the first argument. currently Declarations.yaml + # does not always do this. Instead it keeps the argument list the same order + # as the Type method. + o['arguments'] = self_as_first_argument(o['arguments']) + elif 'namespace' not in o['method_of']: + # methods on type like 'ones' or 'zeros' always take a + # string attribute that is translated into the at::Type object + # e.g. "Float" is at::kFloat + assert('Type' in o['method_of']) + defined_inferred_type = True + env['initialization'].append( + 'auto inferred_type = readTypeAttribute("type");') + + static_tensor_inputs = sum(arg['type'] != 'TensorList' and value_is_tensor_type(arg) for arg in o['arguments']) + has_tensorlist = any(arg['type'] == 'TensorList' for arg in o['arguments']) + if has_tensorlist: + tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] == 'TensorList'][0] + + real_inputs = 0 + for i, arg in enumerate(o['arguments']): + env['arguments'].append(arg['name']) + # Emulate logic in gen_jit_dispatch.py. Pretend the flat argument + # list is a stack where the end is the top. + view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs + if arg['type'] == 'TensorList': + # NOTE: do not advance real_inputs here. After this we will + # switch to indexing the "stack" from the end as if we only had + env['statements'].append( + 'auto {} = peekSlice({}, InputSize() - {}, InputSize());' + .format(arg['name'], real_inputs, static_tensor_inputs)) + elif value_is_tensor_type(arg): + # load tensor inputs from Caffe2 + + env['statements'].append( + 'auto {} = peek({}, {});'.format(arg['name'], real_inputs, view_length)) + real_inputs += 1 + if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type: + # first tensor input is used to define the output type. + defined_inferred_type = True + env['statements'].append( + 'auto inferred_type = &({}.type());'.format( + arg['name'])) + else: + init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) + env['initialization'].append(init) + + for i, r in enumerate(o['returns']): + t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor'] + assignment = CT(t).substitute(env, offset=i, output=get_output(o, i)) + env['assignments'].append(assignment) + + if 'Tensor' in o['method_of']: + env['invocation'] = "self.{}({})".format( + o['name'], ', '.join(env['arguments'][1:])) + elif 'namespace' in o['method_of']: + env['invocation'] = CT("at::${name}(${arguments})").substitute(env) + else: + assert('Type' in o['method_of']) + env['invocation'] = CT( + 'inferred_type->${name}(${arguments})').substitute(env) + + top_env['implementations'].append(OPTION_TEMPLATE.substitute(env)) + key += 1 + write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env)) diff --git a/caffe2/contrib/cuda-convnet2/LICENSE b/caffe2/contrib/cuda-convnet2/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/caffe2/contrib/cuda-convnet2/README.md b/caffe2/contrib/cuda-convnet2/README.md new file mode 100644 index 0000000..f921264 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/README.md @@ -0,0 +1,7 @@ +# cuda-convnet2 +Automatically exported from code.google.com/p/cuda-convnet2 + +You can read the documentation in two ways: + +1. On this site: go to branches > wiki. +2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/ diff --git a/caffe2/contrib/cuda-convnet2/build.sh b/caffe2/contrib/cuda-convnet2/build.sh new file mode 100755 index 0000000..1ecbdd2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/build.sh @@ -0,0 +1,50 @@ +#!/bin/sh +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### + +# Fill in the below environment variables. +# +# If you're not sure what these paths should be, +# you can use the find command to try to locate them. +# For example, NUMPY_INCLUDE_PATH contains the file +# arrayobject.h. So you can search for it like this: +# +# find /usr -name arrayobject.h +# +# (it'll almost certainly be under /usr) + +# CUDA toolkit installation directory. +export CUDA_INSTALL_PATH=/usr/local/cuda + +# Python include directory. This should contain the file Python.h, among others. +export PYTHON_INCLUDE_PATH=/usr/include/python2.7 + +# Numpy include directory. This should contain the file arrayobject.h, among others. +export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ + +# ATLAS library directory. This should contain the file libcblas.so, among others. +export ATLAS_LIB_PATH=/usr/lib/atlas-base + +# You don't have to change these: +export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH +export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples +export PATH=$PATH:$CUDA_INSTALL_PATH/bin + +cd util && make numpy=1 -j $* && cd .. +cd nvmatrix && make -j $* && cd .. +cd cudaconv3 && make -j $* && cd .. +cd cudaconvnet && make -j $* && cd .. +cd make-data/pyext && make -j $* && cd ../.. + diff --git a/caffe2/contrib/cuda-convnet2/convdata.py b/caffe2/contrib/cuda-convnet2/convdata.py new file mode 100644 index 0000000..c79b635 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/convdata.py @@ -0,0 +1,291 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
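+
+# Data providers used by convnet.py:
+#  - ImageDataProvider: decodes JPEG batches on a background thread
+#    (JPEGBatchLoaderThread), optionally adds PCA color noise
+#    (ColorNoiseMakerThread), and supports multiview testing.
+#  - CIFARDataProvider: in-memory CIFAR batches with random crops and
+#    random horizontal flips at training time.
+#  - DummyConvNetLogRegDataProvider: synthetic data for quick sanity checks.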
+ +from python_util.data import * +import numpy.random as nr +import numpy as n +import random as r +from time import time +from threading import Thread +from math import sqrt +import sys +#from matplotlib import pylab as pl +from PIL import Image +from StringIO import StringIO +from time import time +import itertools as it + +class JPEGBatchLoaderThread(Thread): + def __init__(self, dp, batch_num, label_offset, list_out): + Thread.__init__(self) + self.list_out = list_out + self.label_offset = label_offset + self.dp = dp + self.batch_num = batch_num + + @staticmethod + def load_jpeg_batch(rawdics, dp, label_offset): + if type(rawdics) != list: + rawdics = [rawdics] + nc_total = sum(len(r['data']) for r in rawdics) + + jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics)) + labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics)) + + img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32) + lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32) + dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview) + lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1)) + for c in xrange(nc_total): + lab_mat[c, [z + label_offset for z in labels[c]]] = 1 + lab_mat = n.tile(lab_mat, (dp.data_mult, 1)) + + + return {'data': img_mat[:nc_total * dp.data_mult,:], + 'labvec': lab_vec[:nc_total * dp.data_mult,:], + 'labmat': lab_mat[:nc_total * dp.data_mult,:]} + + def run(self): + rawdics = self.dp.get_batch(self.batch_num) + p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics, + self.dp, + self.label_offset) + self.list_out.append(p) + +class ColorNoiseMakerThread(Thread): + def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out): + Thread.__init__(self) + self.pca_stdevs, self.pca_vecs = pca_stdevs, pca_vecs + self.num_noise = num_noise + self.list_out = list_out + + def run(self): + noise = n.dot(nr.randn(self.num_noise, 3).astype(n.single) * self.pca_stdevs.T, self.pca_vecs.T) + self.list_out.append(noise) + +class ImageDataProvider(LabeledDataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False): + LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + self.data_mean = self.batch_meta['data_mean'].astype(n.single) + self.color_eig = self.batch_meta['color_pca'][1].astype(n.single) + self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)] + self.color_noise_coeff = dp_params['color_noise'] + self.num_colors = 3 + self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors)) + self.mini = dp_params['minibatch_size'] + self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size + self.inner_pixels = self.inner_size **2 + self.border_size = (self.img_size - self.inner_size) / 2 + self.multiview = dp_params['multiview_test'] and test + self.num_views = 5*2 + self.data_mult = self.num_views if self.multiview else 1 + self.batch_size = self.batch_meta['batch_size'] + self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset'] + self.scalar_mean = dp_params['scalar_mean'] + # Maintain pointers to previously-returned data matrices so they don't get garbage collected. 
+ self.data = [None, None] # These are pointers to previously-returned data matrices + + self.loader_thread, self.color_noise_thread = None, None + self.convnet = dp_params['convnet'] + + self.num_noise = self.batch_size + self.batches_generated, self.loaders_started = 0, 0 + self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2)) + + if self.scalar_mean >= 0: + self.data_mean_crop = self.scalar_mean + + def showimg(self, img): + from matplotlib import pylab as pl + pixels = img.shape[0] / 3 + size = int(sqrt(pixels)) + img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1) + pl.imshow(img, interpolation='nearest') + pl.show() + + def get_data_dims(self, idx=0): + if idx == 0: + return self.inner_size**2 * 3 + if idx == 2: + return self.get_num_classes() + return 1 + + def start_loader(self, batch_idx): + self.load_data = [] + self.loader_thread = JPEGBatchLoaderThread(self, + self.batch_range[batch_idx], + self.label_offset, + self.load_data) + self.loader_thread.start() + + def start_color_noise_maker(self): + color_noise_list = [] + self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list) + self.color_noise_thread.start() + return color_noise_list + + def set_labels(self, datadic): + pass + + def get_data_from_loader(self): + if self.loader_thread is None: + self.start_loader(self.batch_idx) + self.loader_thread.join() + self.data[self.d_idx] = self.load_data[0] + + self.start_loader(self.get_next_batch_idx()) + else: + # Set the argument to join to 0 to re-enable batch reuse + self.loader_thread.join() + if not self.loader_thread.is_alive(): + self.data[self.d_idx] = self.load_data[0] + self.start_loader(self.get_next_batch_idx()) + #else: + # print "Re-using batch" + self.advance_batch() + + def add_color_noise(self): + # At this point the data already has 0 mean. + # So I'm going to add noise to it, but I'm also going to scale down + # the original data. This is so that the overall scale of the training + # data doesn't become too different from the test data. + + s = self.data[self.d_idx]['data'].shape + cropped_size = self.get_data_dims(0) / 3 + ncases = s[0] + + if self.color_noise_thread is None: + self.color_noise_list = self.start_color_noise_maker() + self.color_noise_thread.join() + self.color_noise = self.color_noise_list[0] + self.color_noise_list = self.start_color_noise_maker() + else: + self.color_noise_thread.join(0) + if not self.color_noise_thread.is_alive(): + self.color_noise = self.color_noise_list[0] + self.color_noise_list = self.start_color_noise_maker() + + self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size)) + self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1)) + self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff + self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size)) + self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division. 
+ + def get_next_batch(self): + self.d_idx = self.batches_generated % 2 + epoch, batchnum = self.curr_epoch, self.curr_batchnum + + self.get_data_from_loader() + + # Subtract mean + self.data[self.d_idx]['data'] -= self.data_mean_crop + + if self.color_noise_coeff > 0 and not self.test: + self.add_color_noise() + self.batches_generated += 1 + + return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T] + + + # Takes as input an array returned by get_next_batch + # Returns a (numCases, imgSize, imgSize, 3) array which can be + # fed to pylab for plotting. + # This is used by shownet.py to plot test case predictions. + def get_plottable_data(self, data, add_mean=True): + mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1)) + return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single) + +class CIFARDataProvider(LabeledDataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False): + LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + self.img_size = 32 + self.num_colors = 3 + self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size'] + self.border_size = (self.img_size - self.inner_size) / 2 + self.multiview = dp_params['multiview_test'] and test + self.num_views = 9 + self.scalar_mean = dp_params['scalar_mean'] + self.data_mult = self.num_views if self.multiview else 1 + self.data_dic = [] + for i in batch_range: + self.data_dic += [unpickle(self.get_data_file_name(i))] + self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single) + self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C') + self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C') + + self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)] + + self.batches_generated = 0 + self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1)) + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + bidx = batchnum - self.batch_range[0] + + cropped = self.cropped_data[self.batches_generated % 2] + + self.__trim_borders(self.data_dic[bidx]['data'], cropped) + cropped -= self.data_mean + self.batches_generated += 1 + return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']] + + def get_data_dims(self, idx=0): + return self.inner_size**2 * self.num_colors if idx == 0 else 1 + + # Takes as input an array returned by get_next_batch + # Returns a (numCases, imgSize, imgSize, 3) array which can be + # fed to pylab for plotting. + # This is used by shownet.py to plot test case predictions. 
+ def get_plottable_data(self, data): + return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single) + + def __trim_borders(self, x, target): + y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1]) + + if self.test: # don't need to loop over cases + if self.multiview: + start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2), + (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2), + (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)] + end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions] + for i in xrange(self.num_views): + target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1])) + else: + pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now + target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1])) + else: + for c in xrange(x.shape[1]): # loop over cases + startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1) + endY, endX = startY + self.inner_size, startX + self.inner_size + pic = y[:,startY:endY,startX:endX, c] + if nr.randint(2) == 0: # also flip the image with 50% probability + pic = pic[:,:,::-1] + target[:,c] = pic.reshape((self.get_data_dims(),)) + +class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider): + def __init__(self, data_dim): + LabeledDummyDataProvider.__init__(self, data_dim) + + self.img_size = int(sqrt(data_dim/3)) + + def get_next_batch(self): + epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self) + dic = {'data': dic[0], 'labels': dic[1]} + print dic['data'].shape, dic['labels'].shape + return epoch, batchnum, [dic['data'], dic['labels']] + + # Returns the dimensionality of the two data matrices returned by get_next_batch + def get_data_dims(self, idx=0): + return self.batch_meta['num_vis'] if idx == 0 else 1 diff --git a/caffe2/contrib/cuda-convnet2/convnet.py b/caffe2/contrib/cuda-convnet2/convnet.py new file mode 100644 index 0000000..99f8a94 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/convnet.py @@ -0,0 +1,289 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
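+
+# Front end for training and testing cuda-convnet2 models (built on IGPUModel).
+# Per-batch work is delegated to a Driver:
+#  - TrainingDriver: ordinary training/testing batches.
+#  - GradCheckDriver: numerical gradient checking.
+#  - MultiviewTestDriver: test-time averaging over multiple views; can also
+#    dump per-batch predictions to the configured test output directory.
+#  - FeatureWriterDriver: writes features of a chosen layer to disk (test only).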
+ +import numpy as n +import numpy.random as nr +import random as r +from python_util.util import * +from python_util.data import * +from python_util.options import * +from python_util.gpumodel import * +import sys +import math as m +import layer as lay +from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider +from os import linesep as NL +import copy as cp +import os + +class Driver(object): + def __init__(self, convnet): + self.convnet = convnet + + def on_start_batch(self, batch_data, train): + pass + + def on_finish_batch(self): + pass + +class GradCheckDriver(Driver): + def on_start_batch(self, batch_data, train): + data = batch_data[2] + self.convnet.libmodel.checkGradients(data) + +class TrainingDriver(Driver): + def on_start_batch(self, batch_data, train): + data = batch_data[2] + self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train) + +class MultiviewTestDriver(TrainingDriver): + def on_start_batch(self, batch_data, train): + self.write_output = False + if train: + TrainingDriver.on_start_batch(self, batch_data, train) + else: + data = batch_data[2] + num_views = self.convnet.test_data_provider.num_views + if self.convnet.test_out != "" and self.convnet.logreg_name != "": + self.write_output = True + self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1]) + self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single) + self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name) + else: + self.convnet.libmodel.startMultiviewTest(data, num_views) + + def on_finish_batch(self): + if self.write_output: + if not os.path.exists(self.convnet.test_out): + os.makedirs(self.convnet.test_out) + pickle(self.test_file_name, {'data': self.probs, + 'note': 'generated from %s' % self.convnet.save_file}) + +class FeatureWriterDriver(Driver): + def __init__(self, convnet): + Driver.__init__(self, convnet) + self.last_batch = convnet.test_batch_range[-1] + + def on_start_batch(self, batch_data, train): + if train: + raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. 
It writes test data features.") + + self.batchnum, self.data = batch_data[1], batch_data[2] + + if not os.path.exists(self.convnet.feature_path): + os.makedirs(self.convnet.feature_path) + + self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs'] + self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single) + self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features]) + + def on_finish_batch(self): + path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum) + pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]}) + print "Wrote feature file %s" % path_out + if self.batchnum == self.last_batch: + pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file, + 'num_vis':self.num_ftrs, + 'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']}) + +class ConvNet(IGPUModel): + def __init__(self, op, load_dic, dp_params={}): + filename_options = [] + for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'): + dp_params[v] = op.get_value(v) + + IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params) + + def import_model(self): + lib_name = "cudaconvnet._ConvNet" + print "=========================" + print "Importing %s C++ module" % lib_name + self.libmodel = __import__(lib_name,fromlist=['_ConvNet']) + + def init_model_lib(self): + self.libmodel.initModel(self.layers, + self.device_ids, + self.minibatch_size, + self.conserve_mem) + + def init_model_state(self): + ms = self.model_state + layers = ms['layers'] if self.loaded_from_checkpoint else {} + ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def), + os.path.join(self.layer_path, self.layer_params), self, layers=layers) + + self.do_decouple_conv() + self.do_unshare_weights() + + self.op.set_value('conv_to_local', [], parse=False) + self.op.set_value('unshare_weights', [], parse=False) + + self.set_driver() + + def do_decouple_conv(self): + # Convert convolutional layers to local + if len(self.op.get_value('conv_to_local')) > 0: + for lname in self.op.get_value('conv_to_local'): + if self.model_state['layers'][lname]['type'] == 'conv': + lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname) + + def do_unshare_weights(self): + # Decouple weight matrices + if len(self.op.get_value('unshare_weights')) > 0: + for name_str in self.op.get_value('unshare_weights'): + if name_str: + name = lay.WeightLayerParser.get_layer_name(name_str) + if name is not None: + name, idx = name[0], name[1] + if name not in self.model_state['layers']: + raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name) + layer = self.model_state['layers'][name] + lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx) + else: + raise ModelStateException("Invalid layer name '%s'; unable to unshare." 
% name_str) + + def set_driver(self): + if self.op.get_value('check_grads'): + self.driver = GradCheckDriver(self) + elif self.op.get_value('multiview_test'): + self.driver = MultiviewTestDriver(self) + elif self.op.get_value('write_features'): + self.driver = FeatureWriterDriver(self) + else: + self.driver = TrainingDriver(self) + + def fill_excused_options(self): + if self.op.get_value('check_grads'): + self.op.set_value('save_path', '') + self.op.set_value('train_batch_range', '0') + self.op.set_value('test_batch_range', '0') + self.op.set_value('data_path', '') + + # Make sure the data provider returned data in proper format + def parse_batch_data(self, batch_data, train=True): + if max(d.dtype != n.single for d in batch_data[2]): + raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.") + return batch_data + + def start_batch(self, batch_data, train=True): + self.driver.on_start_batch(batch_data, train) + + def finish_batch(self): + ret = IGPUModel.finish_batch(self) + self.driver.on_finish_batch() + return ret + + def print_iteration(self): + print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()), + + def print_train_time(self, compute_time_py): + print "(%.3f sec)" % (compute_time_py) + + def print_costs(self, cost_outputs): + costs, num_cases = cost_outputs[0], cost_outputs[1] + children = set() + for errname in costs: + if sum(errname in self.layers[z]['children'] for z in costs) == 0: +# print self.layers[errname]['children'] + for child in set(self.layers[errname]['children']) & set(costs.keys()): + costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])] + children.add(child) + + filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases) + print "%s: " % errname, + if 'outputFilterFormatter' not in self.layers[errname]: + print ", ".join("%.6f" % v for v in filtered_costs), + else: + print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs), + if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]): + print "<- error nan or inf!" + sys.exit(1) + for c in children: + del costs[c] + + def print_train_results(self): + self.print_costs(self.train_outputs[-1]) + + def print_test_status(self): + pass + + def print_test_results(self): + print NL + "======================Test output======================" + self.print_costs(self.test_outputs[-1]) + if not self.test_only: + print NL + "----------------------Averages-------------------------" + self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):])) + print NL + "-------------------------------------------------------", + for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now. 
+ l = self.layers[name] + if 'weights' in l: + wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))] + print "" + print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales), + print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))), + print "" + + def conditional_save(self): + self.save_state() + + def aggregate_test_outputs(self, test_outputs): + test_outputs = cp.deepcopy(test_outputs) + num_cases = sum(t[1] for t in test_outputs) + for i in xrange(1 ,len(test_outputs)): + for k,v in test_outputs[i][0].items(): + for j in xrange(len(v)): + test_outputs[0][0][k][j] += test_outputs[i][0][k][j] + + return (test_outputs[0][0], num_cases) + + @classmethod + def get_options_parser(cls): + op = IGPUModel.get_options_parser() + op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128) + op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False) + op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file") + op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="") + op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range']) + op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0) + op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True) + op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[]) + op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[]) + op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0) + op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0) + op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test']) + op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="") + op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1) + + op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path']) + op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="") + + op.delete_option('max_test_err') + op.options["testing_freq"].default = 57 + op.options["num_epochs"].default = 50000 + op.options['dp_type'].default = None + + DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider) + DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider) + DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider) + + return op + +if __name__ == 
"__main__": +# nr.seed(6) + + op = ConvNet.get_options_parser() + + op, load_dic = IGPUModel.parse_options(op) + model = ConvNet(op, load_dic) + model.start() diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile b/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile new file mode 100644 index 0000000..3d16e28 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/Makefile @@ -0,0 +1,108 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . 
-name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/libcudaconv.so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) + ln -sf $(TARGET) . + +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh b/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh new file mode 100644 index 0000000..6a4bd95 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh @@ -0,0 +1,1164 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CONV_UTIL_CUH +#define CONV_UTIL_CUH + +#include "../../nvmatrix/include/nvmatrix.cuh" + +#include "caffe2/core/context_gpu.h" + +#ifndef MIN +#define MIN(a, b) ((a) > (b) ? (b) : (a)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX); +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum); + +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum, + float scaleTargets, + float scaleOutput); +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + float scaleTargets, + float scaleOutput); + +void convResponseNorm( + NVMatrix& images, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv); +void convResponseNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput); +void convContrastNorm( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv); +void convContrastNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& meanDiffs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput); + +void convGaussianBlur( + NVMatrix& images, + NVMatrix& filter, + NVMatrix& target, + bool horiz, + int numChannels, + float scaleTargets, + float scaleOutputs); +void convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput); +void convBedOfNailsUndo( + NVMatrix& actsGrad, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput); + +void convResizeBilinear( + NVMatrix& images, + NVMatrix& target, + int imgSize, + int tgtSize, + float scale); +void convRGBToYUV(NVMatrix& images, NVMatrix& target); +void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center); +void convCrop( + NVMatrix& imgs, + NVMatrix& target, + int imgSize, + int tgtSize, + int startY, + int startX); +void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm); +void convContrastNormCrossMap( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked); +void convResponseNormCrossMapUndo( + NVMatrix& outGrads, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked, + float scaleTargets, + float scaleOutput); +void convResponseNormCrossMap( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + bool blocked); +void convResponseNormCrossMap( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked); +void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize); + +void convCrossMapMaxPoolUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + const int imgSize, + const int startF, + const int 
poolSize, + const int stride, + const float scaleTargets, + const float scaleOutputs); + +cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor); + +template +class AvgPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + __device__ inline float getBaseValue() const { + return 0; + } + __device__ inline float output(const float a, const int regionSize) const { + return sum ? a : (a / regionSize); + } +}; + +class MaxPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return fmaxf(a, b); + } + __device__ inline float getBaseValue() const { + return -2e38; + } + __device__ inline float output(const float a, const int regionSize) const { + return a; + } +}; + +class MaxAbsPooler { + public: + __device__ inline float operator()(const float a, const float b) const { + return fabsf(a) > fabsf(b) ? a : b; + } + __device__ inline float getBaseValue() const { + return 0.0f; + } + __device__ inline float output(const float a, const int regionSize) const { + return a; + } +}; + +/* + * Block size B_YxB_X + * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, numOutputs, numImages) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + */ + +template < + class Agg, + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kLocalPool( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + Agg agg) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = DIVUP(numFilters, B_Y * filtersPerThread); + const int outputIdxX = blockIdx.x / numImgBlocks; + const int outputIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread; + const int myFilterIdx = (blockFilterIdx + threadIdx.y * filtersPerThread); + if (myFilterIdx >= numFilters) { + return; + } + + const int outputIdx = outputIdxY * outputsX + outputIdxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startImgPxX = startX + outputIdxX * strideX; + const int startImgPxY = startX + outputIdxY * strideX; + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += myFilterIdx * imgPixels * numImages + imgIdx; + target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = agg.getBaseValue(); + } + } + + const int loopStartY = MAX(0, startImgPxY); + const int loopStartX = MAX(0, startImgPxX); + const int loopEndY = MIN(imgSize, startImgPxY + subsX); + const int loopEndX = MIN(imgSize, startImgPxX + subsX); + const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX); + for (int y = loopStartY; y < loopEndY; y++) { + for (int x = loopStartX; x < loopEndX; x++) { + 
const int imgPx = y * imgSize + x; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = + agg(prod[f][i], + imgs[(f * imgPixels + imgPx) * numImages + i * B_X]); + } + } + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * numOutputs * numImages + i * B_X] = + agg.output(prod[f][i], regionSize); + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, output idx in batches of B_Y + * + * So each block does one pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines output idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numOutputs, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + */ +template +__global__ void kPoolCrossMap( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + Agg agg) { + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + // const int numOutputs = DIVUP(numFilters, stride); + const int numOutputBlocks = DIVUP(numOutputs, B_Y); + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numOutputBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y; + // const int filterIdx = outputIdx * stride; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + if (outputIdx < numOutputs) { + imgs += (pxIdx)*numImages + imgIdx; + target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = agg.getBaseValue(); + } + } + + const int myStartF = startF + outputIdx * stride; + const int loopStartF = max(0, myStartF); + const int loopEndF = min(numFilters, myStartF + poolSize); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]); + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = agg.output(prod[i], poolSize); + } + } + } +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * target: (numOutputs, imgPixels, numImages) + */ +template +void convPoolCrossMap( + NVMatrix& images, + NVMatrix& target, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + const int imgSize, + Pooler pooler) { + int numImages = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numFilters = images.getNumRows() / imgPixels; + assert(images.getNumRows() == numFilters * imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + // assert(numFilters % 4 == 0); + // assert(numImages % 128 == 0); + 
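For reference, the cross-map pooling computed by kPoolCrossMap above (and dispatched by this host wrapper) can be stated in a few lines of NumPy. The sketch below covers only the MaxPooler case and assumes the (numFilters, imgPixels, numImages) layout described in the kernel comments; the function name and the flattened input/output shapes are illustrative, not part of the patch:

```python
import numpy as np

def cross_map_max_pool(imgs, num_filters, img_pixels, start_f,
                       pool_size, num_outputs, stride):
    """NumPy reference for the MaxPooler case of convPoolCrossMap.

    `imgs` holds (numFilters, imgPixels, numImages) flattened to
    (numFilters*imgPixels, numImages).  Output o pools the filter slice
    [start_f + o*stride, start_f + o*stride + pool_size), clipped to the
    valid filter range, independently at every pixel and image.
    """
    num_images = imgs.shape[1]
    x = imgs.reshape(num_filters, img_pixels, num_images)
    out = np.empty((num_outputs, img_pixels, num_images), dtype=x.dtype)
    for o in range(num_outputs):
        lo = max(0, start_f + o * stride)
        hi = min(num_filters, start_f + o * stride + pool_size)
        out[o] = x[lo:hi].max(axis=0)
    return out.reshape(num_outputs * img_pixels, num_images)
```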
assert(stride <= poolSize); + assert(startF <= 0); + assert( + startF + (numOutputs - 1) * stride + poolSize >= + numFilters); // All filters must be covered + + cudaStream_t stream = NVMatrix::getDefaultStream(); + target.resize(imgPixels * numOutputs, numImages); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + + dim3 threads(32, 4); + dim3 blocks( + imgSize * DIVUP(numImages, threads.x * imgsPerThread), + imgSize * DIVUP(numOutputs, threads.y)); + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + + } else if (imgsPerThread == 1) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + } + } else { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig( + kPoolCrossMap, cudaFuncCachePreferShared); + kPoolCrossMap<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + pooler); + } else { + assert(false); + } + } + getLastCudaError("convPoolCrossMap: kernel execution failed"); +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does a 4x4 region for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, numOutputs, numImages) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + * + * To be used when the stride is 1 and the pooling region is fairly large. 
+ */ +template < + class Agg, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kLocalPool2( + float* imgs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int outputsX, + Agg agg) { + __shared__ float shImgs[filtersPerThread][B_X * imgsPerThread]; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockOutputX = 4 * (blockIdx.x / numImgBlocks); + const int blockOutputY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + // const int blockOutputIdx = blockOutputY * outputsX + blockOutputX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int myX = threadIdx.y % 4; + const int myY = threadIdx.y / 4; + + const int myOutputIdxY = blockOutputY + myY; + const int myOutputIdxX = blockOutputX + myX; + const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX; + + const int startImgPxX = startX + blockOutputX; + const int startImgPxY = startX + blockOutputY; + const int endImgPxX = startImgPxX + subsX; + const int endImgPxY = startImgPxY + subsX; + + const int myStartImgPxY = startImgPxY + myY; + const int myStartImgPxX = startImgPxX + myX; + const int myEndImgPxY = endImgPxY + myY; + const int myEndImgPxX = endImgPxX + myX; + + const int loopStartY = MAX(startImgPxY, 0); + const int loopStartX = MAX(startImgPxX, 0); + const int loopEndY = MIN(imgSize, endImgPxY + 3); + const int loopEndX = MIN(imgSize, endImgPxX + 3); + + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = agg.getBaseValue(); + } + } + int regionSize = 0; + for (int y = loopStartY; y < loopEndY; y++) { + const bool isInY = y >= myStartImgPxY && y < myEndImgPxY; + for (int x = loopStartX; x < loopEndX; x++) { + // Load a pixel + const int px = y * imgSize + x; +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shImgs[ly + loadY][lx + loadX] = + imgs[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Is this pixel in my region? 
+ if (isInY && x >= myStartImgPxX && x < myEndImgPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]); + } + } + } + ++regionSize; + } + __syncthreads(); + } + } + if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * numOutputs * numImages + i * B_X] = + agg.output(prod[f][i], regionSize); + } + } + } + } +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * target: (numFilters, outputs, numImages) + */ +template +void convLocalPool( + NVMatrix& images, + NVMatrix& target, + int numFilters, + int subsX, + int startX, + int strideX, + int outputsX, + Pooler pooler) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + // assert(numFilters % 4 == 0); + // assert(numImages % 128 == 0); + cudaStream_t stream = NVMatrix::getDefaultStream(); + int outputs = outputsX * outputsX; + target.resize(numFilters * outputs, numImages); + + if (strideX == 1 && subsX >= 6 && outputsX > 1) { + // NOTE: this part has not been optimized for Kepler + int imgsPerThread = numImages % 128 == 0 ? 8 : 4; + int filtersPerThread = numFilters % 4 == 0 + ? 4 + : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1; + int bx = 8; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + assert(numFilters % filtersPerThread == 0); + dim3 threads(bx, 16); + dim3 blocks( + DIVUP(outputsX, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(outputsX, 4) * numFilters / filtersPerThread); + // printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: + // %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n", + // threads.y, threads.x, blocks.y, blocks.x, imgSize, + // numFilters, numImages, subsX, startX, outputsX); + if (imgsPerThread == 8) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 3) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + 
target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } + } else if (imgsPerThread == 4) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 3) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } else if (filtersPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool2, cudaFuncCachePreferShared); + kLocalPool2<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + outputsX, + pooler); + } + } + } + } else { + int filtersPerThread = numFilters % 16 == 0 ? 4 : 1; + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * outputsX, + DIVUP(numFilters, 4 * filtersPerThread) * outputsX); + if (imgsPerThread == 4) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } else if (imgsPerThread == 2) { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } else { + if (filtersPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool<<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } else { + cudaFuncSetCacheConfig( + kLocalPool, cudaFuncCachePreferL1); + kLocalPool + <<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + pooler); + } + } + } + } + getLastCudaError("convLocalPool: kernel execution failed"); +} + +#endif /* CONV_UTIL_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh b/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh 
new file mode 100644 index 0000000..dc92cb7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh @@ -0,0 +1,197 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COMMON_CUH +#define COMMON_CUH + +#include // helper functions CUDA error checking and initialization +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "conv_util.cuh" + +#include "caffe2/core/context_gpu.h" + +enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE }; + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + 
caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth); +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth, + float scaleTargets, + float scaleOutput); + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups); + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput); + +#endif /* COMMON_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu new file mode 100644 index 0000000..61f60bd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu @@ -0,0 +1,5019 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../nvmatrix/include/nvmatrix_kernels.cuh" +#include "../include/conv_util.cuh" + +using namespace std; + +__device__ inline float square(const float a) { + return a * a; +} + +/* + * Horizontal reflection. + * imgs: (numColors, imgSize, imgSize, numCases) + * targets: (numColors, imgSize, imgSize, numCases) + * + * targets should be a different array from imgs. 
+ * + * Block size: (4, 32) + * blockIdx.y * 4 + threadIdx.y determines pixel + * blockIdx.x * 32 * imgsPerThread + threadIdx.x determines case batch + * + */ +template +__global__ void +kReflectH(float* imgs, float* targets, const int imgSize, const int numCases) { + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + const int imgPixels = imgSize * imgSize; + + if (pxIdx < imgPixels) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdxY = pxIdx / imgSize; + const int pxIdxX = pxIdx % imgSize; + + const int pxIdxXR = imgSize - 1 - pxIdxX; // reflected coordinate + const int pxIdxR = pxIdxY * imgSize + pxIdxXR; + + imgs += pxIdx * numCases + caseIdx; + targets += pxIdxR * numCases + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numCases) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { + targets[c * imgPixels * numCases + i * 32] = + imgs[c * imgPixels * numCases + i * 32]; + } + } + } + } +} +/* + * Horizontal reflection. + * imgs: (numColors, imgSize, imgSize, numCases) + * targets: (numColors, imgSize, imgSize, numCases) + */ +void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize) { + int numCases = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numColors = images.getNumRows() / imgPixels; + assert(numColors * imgPixels == images.getNumRows()); + assert(numColors > 0 && numColors <= 3); + + targets.resize(images); + int imgsPerThread = numCases % 128 == 0 ? 4 : numCases % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numCases % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks(DIVUP(numCases, imgsPerThread * 32), DIVUP(imgPixels, 4)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (checkCaseBounds) { + if (numColors == 1) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<1, 1, true>, cudaFuncCachePreferL1); + kReflectH<1, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<1, 2, true>, cudaFuncCachePreferL1); + kReflectH<1, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<1, 4, true>, cudaFuncCachePreferL1); + kReflectH<1, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 2) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<2, 1, true>, cudaFuncCachePreferL1); + kReflectH<2, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<2, 2, true>, cudaFuncCachePreferL1); + kReflectH<2, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<2, 4, true>, cudaFuncCachePreferL1); + kReflectH<2, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 3) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<3, 1, true>, cudaFuncCachePreferL1); + kReflectH<3, 1, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<3, 2, true>, cudaFuncCachePreferL1); + kReflectH<3, 2, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + 
cudaFuncSetCacheConfig(kReflectH<3, 4, true>, cudaFuncCachePreferL1); + kReflectH<3, 4, true><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } + } else { + if (numColors == 1) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<1, 1, false>, cudaFuncCachePreferL1); + kReflectH<1, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<1, 2, false>, cudaFuncCachePreferL1); + kReflectH<1, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<1, 4, false>, cudaFuncCachePreferL1); + kReflectH<1, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 2) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<2, 1, false>, cudaFuncCachePreferL1); + kReflectH<2, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<2, 2, false>, cudaFuncCachePreferL1); + kReflectH<2, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<2, 4, false>, cudaFuncCachePreferL1); + kReflectH<2, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } else if (numColors == 3) { + if (imgsPerThread == 1) { + cudaFuncSetCacheConfig(kReflectH<3, 1, false>, cudaFuncCachePreferL1); + kReflectH<3, 1, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 2) { + cudaFuncSetCacheConfig(kReflectH<3, 2, false>, cudaFuncCachePreferL1); + kReflectH<3, 2, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } else if (imgsPerThread == 4) { + cudaFuncSetCacheConfig(kReflectH<3, 4, false>, cudaFuncCachePreferL1); + kReflectH<3, 4, false><<>>( + images.getDevData(), targets.getDevData(), imgSize, numCases); + } + } + } + getLastCudaError("kReflectH: kernel execution failed"); +} + +/* + * blockIdx.y determines module in batches of B_Y + * blockIdx.x determines filter in batches of B_X * filtersPerThread + * + * weights: (numModules, numColors, filterPixels, numFilters) + * Not fully coalesced if B_X < 32, so use cache. + */ +template +__global__ void kNormalizeLCWeights( + float* weights, + const uint numFilters, + const int numModules, + const uint weightsPerFilter, + const float norm) { + const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y; + const uint filterIdx = B_X * blockIdx.x + threadIdx.x; + + float prod[filtersPerThread]; +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] = 0; + } + if (moduleIdx < numModules) { + weights += moduleIdx * weightsPerFilter * numFilters + filterIdx; + for (uint p = 0; p < weightsPerFilter; ++p) { +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] += square(weights[p * numFilters + i * B_X]); + } + } + +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + prod[i] = sqrtf(prod[i]); + prod[i] = prod[i] > norm ? 
__fdividef(norm, prod[i]) : 1.0f; + } + + for (uint p = 0; p < weightsPerFilter; ++p) { +#pragma unroll + for (uint i = 0; i < filtersPerThread; ++i) { + weights[p * numFilters + i * B_X] *= prod[i]; + } + } + } +} + +/* + * weights: (numModules, numColors, filterPixels, numFilters) + */ +void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) { + int numFilters = weights.getNumCols(); + int weightsPerFilter = weights.getNumRows() / numModules; + assert(numModules * weightsPerFilter == weights.getNumRows()); + + assert(!weights.isTrans()); + assert(weights.isContiguous()); + assert(numFilters % 16 == 0); + + int bx = numFilters % 32 == 0 ? 32 : 16; + int by = bx == 32 ? 4 : 8; + + int filtersPerThread = + numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1; + dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by)); + dim3 threads(bx, by); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (filtersPerThread == 4) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 4><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else if (filtersPerThread == 2) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 2><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else { + if (numFilters % 32 == 0) { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1); + kNormalizeLCWeights<4, 32, 1><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } else { + cudaFuncSetCacheConfig( + kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1); + kNormalizeLCWeights<8, 16, 1><<>>( + weights.getDevData(), numFilters, numModules, weightsPerFilter, norm); + } + } +} + +/* + * Block size 4x32 + * blockIdx.x determines img idx in batches of 32*imgsPerThread + * blockIdx.y determines channel idx, pixel idx in batches of 4 + * + * threadIdx.x determins case idx + * threadIdx.y determines pixel idx + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + */ +template +__global__ void kCrop( + float* imgs, + float* target, + const uint numImages, + const int imgStride, + const uint imgSize, + const uint tgtSize, + const uint startY, + const uint startX) { + const uint imgPixels = imgSize * imgSize; + const uint tgtPixels = tgtSize * tgtSize; + const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4); + const uint tgtPixelIdx = 4 * (blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y; + const uint tgtPxY = tgtPixelIdx / tgtSize; + const uint tgtPxX = tgtPixelIdx % tgtSize; + const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX; + + if (tgtPixelIdx < tgtPixels) { + imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx; + target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx; + +#pragma unroll + for (uint i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) { + target[i * 32] = imgs[i * 32]; + } + } + } +} + +/* + * Block size 4x32 + * blockIdx.y determines pixel idx in batches of 4 + * blockIdx.x determines case idx in batches of 32*imgsPerThread + * threadIdx.y determines pixel idx + * threadIdx.x determines case idx + * + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + * + * 
Each thread produces (y,u,v) values for a particular (r,g,b) pixel + * + * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV): + * + * [Y] [ 0.2126 0.7152 0.0722 ][R] + * [U] = [-0.09991 -0.33609 0.436 ][G] + * [V] [ 0.615 -0.55861 -0.05639][B] + */ +template +__global__ void kRGBToYUV( + float* imgs, + float* target, + const int imgPixels, + const int numImages, + const int imgStride) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + + if (pxIdx < imgPixels) { + const int imgChannelStride = imgPixels * imgStride; + const int tgtChannelStride = imgPixels * numImages; + imgs += pxIdx * imgStride + caseIdx; + target += pxIdx * numImages + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numImages) { + const float R = imgs[0 * imgChannelStride + i * 32]; + const float G = imgs[1 * imgChannelStride + i * 32]; + const float B = imgs[2 * imgChannelStride + i * 32]; + target[0 * tgtChannelStride + i * 32] = + 0.2126f * R + 0.7152f * G + 0.0722f * B; // Y + target[1 * tgtChannelStride + i * 32] = + -0.09991f * R + -0.33609f * G + 0.436f * B; // U + target[2 * tgtChannelStride + i * 32] = + 0.615f * R + -0.55861f * G + -0.05639f * B; // V + } + } + } +} + +__device__ inline float labf(const float x) { + if (x > 0.0088564517f) { + return __powf(x, 0.3333f); + } + return 7.787037f * x + 0.13793103f; +} + +/* + * Block size 4x32 + * blockIdx.y determines pixel idx in batches of 4 + * blockIdx.x determines case idx in batches of 32*imgsPerThread + * threadIdx.y determines pixel idx + * threadIdx.x determines case idx + * + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + * + * This proceeds in two steps. + * + * - First, RGB values are linearly transformed to XYZ as per + * http://en.wikipedia.org/wiki/CIE_XYZ_color_space + * - Second, XYZ values are nonlinearly transformed to L*a*b* as per + * http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation + * + * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel + * + * The RGB --> XYZ transform is: + * + * [X] [0.49 0.31 0.2 ][R] + * [Y] = 5.6506753 * [0.17697 0.8124 0.01063 ][G] + * [Z] [0 0.01 0.99 ][B] + * + * NOTE: The input should be in the range 0-1. Don't do mean-subtraction + * beforehand. + * + * Then X_max, Y_max, Z_max = 5.6506753. + * + * The range of the L* values is [0, 100]. + * If the center flag is given, the range will be [-50, 50]. 
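+ *
+ * Illustrative sanity check (derived from the kernel below, not an extra
+ * requirement): for pure white, R = G = B = 1, the three linear combinations
+ * computed below each evaluate to 1, labf(1) = 1, so L* = 116 - 16 = 100 and
+ * a* = b* = 0 (L* becomes 50 when the center flag is set).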
+ * + */ +template +__global__ void kRGBToLAB( + float* imgs, + float* target, + const int imgPixels, + const int numImages, + const int imgStride) { + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int pxIdx = blockIdx.y * 4 + threadIdx.y; + + if (pxIdx < imgPixels) { + const int imgChannelStride = imgPixels * imgStride; + const int tgtChannelStride = imgPixels * numImages; + imgs += pxIdx * imgStride + caseIdx; + target += pxIdx * numImages + caseIdx; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + if (!checkCaseBounds || caseIdx + i * 32 < numImages) { + const float R = imgs[0 * imgChannelStride + i * 32]; + const float G = imgs[1 * imgChannelStride + i * 32]; + const float B = imgs[2 * imgChannelStride + i * 32]; + + const float X = (0.49f * R + 0.31f * G + 0.2f * B); + const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B); + const float Z = (0.01f * G + 0.99f * B); + + const float labX = labf(X); + const float labY = labf(Y); + const float labZ = labf(Z); + + target[0 * tgtChannelStride + i * 32] = + 116.0f * labY - 16.0f - (center ? 50.0f : 0); // L* + target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY); // a* + target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ); // b* + } + } + } +} + +/* + * Block size 16x32. + * Each block produces a 4x4 chunk of the output image. + * threadIdx.y determines pixel idx in 4x4 chunk. + * threadIdx.x determines case idx. + * blockIdx.x determines case idx in batches of 32*imgsPerThread. + * blockIdx.y determines 4x4 chunk idx, channel idx. + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + * + * imgSize = scale * tgtSize (roughly) + * + * This is a rather naive kernel that relies on cache for speed. But all it's + * doing is basic texture manipulation, which is very local in nature, so it + * should be ok. Also, it will in practice be a tiny fraction of the runtime of + * a large convnet. + * + * So that is my justification for being lazy here. + */ +template +__global__ void kResizeBilinear( + float* imgs, + float* target, + const int imgSize, + const int tgtSize, + const int numImages, + const int imgStride, + const float scale, + const float centerScale) { + const int numChunksX = DIVUP(tgtSize, 4); + const int numChunks = numChunksX * numChunksX; + const int channelIdx = blockIdx.y / numChunks; + const int chunkIdx = blockIdx.y % numChunks; + const int chunkIdxX = chunkIdx % numChunksX; + const int chunkIdxY = chunkIdx / numChunksX; + const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; + const int imgPixels = imgSize * imgSize; + const int tgtPixels = tgtSize * tgtSize; + + const int pxX = 4 * chunkIdxX + threadIdx.y % 4; + const int pxY = 4 * chunkIdxY + threadIdx.y / 4; + + if (pxY < tgtSize && pxX < tgtSize) { + const int pxIdx = pxY * tgtSize + pxX; + + imgs += channelIdx * imgPixels * imgStride + caseIdx; + target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx; + + // This will cause slight distortions at the edges when upsampling in some + // cases. But I think that's not a big deal. 
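+    // Illustrative restatement of the arithmetic below: with u the weight of
+    // the left source column and w the weight of the bottom source row,
+    //   c0  = u*val0 + (1-u)*val1   (top edge)
+    //   c1  = u*val2 + (1-u)*val3   (bottom edge)
+    //   out = (1-w)*c0 + w*c1
+    // which is exactly the fused form used in the loop further down.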
+ const float srcPxX = fmaxf( + 0.0f, + fminf( + __int2float_rn(imgSize) - 1.01f, + __int2float_rn(pxX) * scale + centerScale)); + const float srcPxY = fmaxf( + 0.0f, + fminf( + __int2float_rn(imgSize) - 1.01f, + __int2float_rn(pxY) * scale + centerScale)); + + const float u = floorf(srcPxX + 1) - srcPxX; + const float w = srcPxY - floorf(srcPxY); + + // Consider doing max(0, min(imgSize, x)) here + const int srcPx0 = + (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left + const int srcPx1 = srcPx0 + 1; // top-right + const int srcPx2 = srcPx0 + imgSize; // bottom-left + const int srcPx3 = srcPx2 + 1; // bottom-right + +#pragma unroll + for (int c = 0; c < imgsPerThread; ++c) { + if (!checkCaseBounds || caseIdx + c * 32 < numImages) { + const float val0 = imgs[srcPx0 * imgStride + c * 32]; + const float val1 = imgs[srcPx1 * imgStride + c * 32]; + const float val2 = imgs[srcPx2 * imgStride + c * 32]; + const float val3 = imgs[srcPx3 * imgStride + c * 32]; + + const float c0 = u * (val0 - val1) + val1; + const float c1 = u * (val2 - val3) + val3; + + target[32 * c] = w * (c1 - c0) + c0; + } + } + } +} + +/* + * Block size B_YxB_X. + * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx + * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx + * + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * filter: (1, 2*radius + 1) + * target: (numChannels, imgPixels, numImages) + * + * target can be the same matrix as imgs. + * radius must be one of 3, 5, 7, 9. + * + * Tried imgsPerThread, slower. + */ +template +__global__ void kGaussianBlur( + float* imgs, + float* filter, + float* target, + const int imgSize, + const int numImages, + const int imgStride, + const int numChannels, + const bool horiz, + const float scaleTargets, + const float scaleOutputs) { + const int filterWidth = 2 * radius + 1; + __shared__ float shFilter[filterWidth - 1]; + + const int imgPixels = imgSize * imgSize; + const int ty = B_Y * blockIdx.y + threadIdx.y; + const int channelIdx = ty / imgSize; + const int rowIdx = ty % imgSize; + const int imgIdx = B_X * blockIdx.x + threadIdx.x; + + // const int tidx = B_Y * threadIdx.y + threadIdx.x; + if (horiz) { + imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride + + imgIdx; + target += channelIdx * imgPixels * numImages + + rowIdx * imgSize * numImages + imgIdx; + } else { + imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx; + target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx; + } + float outputs[filterWidth - 1]; +#pragma unroll + for (int r = 0; r < filterWidth - 1; r++) { + outputs[r] = 0; + } + if (threadIdx.x < filterWidth - 1) { + shFilter[threadIdx.x] = filter[threadIdx.x]; + } + __syncthreads(); + + if (imgIdx < numImages && channelIdx < numChannels) { +// This writes radius*2 = filterWidth - 1 values to outputs +#pragma unroll + for (int col = 0; col < radius; col++) { + float px = imgs[0]; +#pragma unroll + for (int r = 0; r < radius + 1 + col; r++) { + outputs[r] += px * shFilter[radius + col - r]; + } + imgs += horiz ? 
imgStride : imgStride * imgSize; + } + + // Unfortunately this has to be at this level of granularity + if (scaleTargets != 0) { + for (int col = radius; col < imgSize; col++) { // loop over img columns + float px = imgs[0]; + target[0] = scaleTargets * target[0] + + scaleOutputs * (outputs[0] + px * shFilter[0]); + +#pragma unroll + for (int r = 1; r < radius * 2; r++) { + outputs[r - 1] = outputs[r] + px * shFilter[r]; + } + outputs[filterWidth - 2] = px * shFilter[0]; + + imgs += horiz ? imgStride : imgStride * imgSize; + target += horiz ? numImages : numImages * imgSize; + } + +#pragma unroll + for (int r = 0; r < radius; r++) { + float* t = &target[0]; + t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r]; + target += horiz ? numImages : numImages * imgSize; + } + } else { + for (int col = radius; col < imgSize; col++) { // loop over img columns + float px = imgs[0]; + target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]); +#pragma unroll + for (int r = 1; r < radius * 2; r++) { + outputs[r - 1] = outputs[r] + px * shFilter[r]; + } + outputs[filterWidth - 2] = px * shFilter[0]; + + imgs += horiz ? imgStride : imgStride * imgSize; + target += horiz ? numImages : numImages * imgSize; + } + +#pragma unroll + for (int r = 0; r < radius; r++) { + target[0] = scaleOutputs * outputs[r]; + target += horiz ? numImages : numImages * imgSize; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numChannels, imgPixels, numImages) + * target: (numChannels, numOutputs, numImages) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + */ + +template < + int B_Y, + int B_X, + int imgsPerThread, + int chansPerThread, + bool checkCaseBounds> +__global__ void kBedOfNails( + float* imgs, + float* target, + const int imgSize, + const int numChannels, + const int numImages, + const int startX, + const int strideX, + const int outputsX, + const bool reverse, + const float scaleTargets, + const float scaleOutput) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numChanBlocks = DIVUP(numChannels, B_Y * chansPerThread); + const int outputIdxX = blockIdx.x / numImgBlocks; + const int outputIdxY = blockIdx.y / numChanBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockChanIdx = (blockIdx.y % numChanBlocks) * B_Y * chansPerThread; + const int myChanIdx = (blockChanIdx + threadIdx.y * chansPerThread); + if (myChanIdx >= numChannels) { + return; + } + // if (blockIdx.x != 0 || blockIdx.y != 0) { + // return; + // } + const int outputIdx = outputIdxY * outputsX + outputIdxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startImgPxX = startX + outputIdxX * strideX; + const int startImgPxY = startX + outputIdxY * strideX; + const int imgIdx = blockImgIdx + threadIdx.x; + const int imgPx = startImgPxY * imgSize + startImgPxX; + + imgs += myChanIdx * imgPixels * numImages + imgPx * numImages + imgIdx; + target += (myChanIdx * numOutputs + outputIdx) * numImages + imgIdx; + + if (scaleTargets != 0) { + if (!reverse) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if 
(!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + target[c * numOutputs * numImages + i * B_X] = + scaleTargets * target[c * numOutputs * numImages + i * B_X] + + scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + imgs[c * imgPixels * numImages + i * B_X] = + scaleTargets * imgs[c * imgPixels * numImages + i * B_X] + + scaleOutput * target[c * numOutputs * numImages + i * B_X]; + } + } + } + } + } else { + if (!reverse) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + target[c * numOutputs * numImages + i * B_X] = + scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < chansPerThread; c++) { + imgs[c * imgPixels * numImages + i * B_X] = + scaleOutput * target[c * numOutputs * numImages + i * B_X]; + } + } + } + } + } +} + +/* + * imgs: (numChannels, imgPixels, numImages) + * target: (numChannels, outputs, numImages) + */ +void _convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + bool reverse, + float scaleTargets, + float scaleOutput) { + int numImages = reverse ? target.getNumCols() : images.getNumCols(); + int imgPixels = imgSize * imgSize; + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(images.isContiguous()); + assert(target.isContiguous()); + assert(strideX > 1); + + int outputsX = DIVUP(imgSize, strideX); + int outputs = outputsX * outputsX; + if (reverse) { + assert(target.getNumRows() == numChannels * outputs); + } else { + assert(images.getNumRows() == numChannels * imgPixels); + } + + if (scaleTargets == 0) { + if (reverse) { + images.resize(numChannels * imgPixels, numImages); + images.apply(NVMatrixOps::Zero()); + } else { + target.resize(numChannels * outputs, numImages); + } + } else { + if (reverse) { + assert(images.getNumRows() == numChannels * outputs); + assert(images.getNumCols() == numImages); + } else { + assert(target.getNumRows() == numChannels * outputs); + assert(target.getNumCols() == numImages); + } + } + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + int chansPerThread = numChannels % 8 == 0 ? 
2 : 1; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * outputsX, + DIVUP(numChannels, 4 * chansPerThread) * outputsX); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 4, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 2, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } else { + if (chansPerThread == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1); 
+ kBedOfNails<4, 32, 1, 2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1); + kBedOfNails<4, 32, 1, 2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + numChannels, + numImages, + startX, + strideX, + outputsX, + reverse, + scaleTargets, + scaleOutput); + } + } + } +} + +void convBedOfNails( + NVMatrix& images, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput) { + _convBedOfNails( + images, + target, + numChannels, + imgSize, + startX, + strideX, + false, + scaleTargets, + scaleOutput); +} + +void convBedOfNailsUndo( + NVMatrix& actsGrad, + NVMatrix& target, + int numChannels, + int imgSize, + int startX, + int strideX, + float scaleTargets, + float scaleOutput) { + _convBedOfNails( + target, + actsGrad, + numChannels, + imgSize, + startX, + strideX, + true, + scaleTargets, + scaleOutput); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * filter: (1, 2*radius + 1) + * target: (numChannels, imgPixels, numImages) + */ +void convGaussianBlur( + NVMatrix& images, + NVMatrix& filter, + NVMatrix& target, + bool horiz, + int numChannels, + float scaleTargets, + float scaleOutputs) { + int numImages = images.getNumCols(); + int radius = filter.getNumCols() / 2; + int imgPixels = images.getNumRows() / numChannels; + int imgSize = int(sqrt(imgPixels)); + + assert(imgPixels == imgSize * imgSize); + assert(radius >= 1 && radius <= 4); + assert(imgSize >= 2 * radius + 1); + assert(filter.getNumRows() == 1); + assert(images.getNumRows() == numChannels * imgPixels); + assert(!images.isTrans()); + assert(!filter.isTrans()); + assert(!target.isTrans()); + assert(target.isContiguous()); + if (scaleTargets == 0) { + target.resize(images); + } else { + assert(target.isSameDims(images)); + } + + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, threads.x), DIVUP(numChannels * imgSize, threads.y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (radius == 1) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 1><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + + } else if (radius == 2) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 2><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + + } else if (radius == 3) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 3><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + } else if (radius == 4) { + cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, cudaFuncCachePreferL1); + kGaussianBlur<4, 32, 4><<>>( + images.getDevData(), + filter.getDevData(), + target.getDevData(), + imgSize, + numImages, + images.getStride(), + numChannels, + horiz, + scaleTargets, + scaleOutputs); + } +} + +/* + * Block size 1x128 + * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread + * 
blockIdx.y determines pixel.y
+ *
+ * So each block does one output for some number of images and all the filters.
+ *
+ * threadIdx.x determines img idx
+ *
+ * imgs: (numFilters, imgPixels, numImages)
+ * meanDiffs: (numFilters, imgPixels, numImages)
+ * denoms: (numFilters, imgPixels, numImages) (out)
+ * target: (numFilters, imgPixels, numImages) (out)
+ *
+ * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
+ * numFilters must be divisible by B_Y*filtersPerThread
+ */
+
+template <int imgsPerThread, int numFilters, bool checkCaseBounds>
+__global__ void kCNorm_fewfilter(
+    float* imgs,
+    float* meanDiffs,
+    float* denoms,
+    float* target,
+    const int imgSize,
+    const int numImages,
+    const int sizeX,
+    const float addScale,
+    const float powScale,
+    const float minDiv) {
+  const int imgPixels = imgSize * imgSize;
+  const int numImgBlocks = DIVUP(numImages, 128 * imgsPerThread);
+  const int pxIdxX = blockIdx.x / numImgBlocks;
+  const int pxIdxY = blockIdx.y;
+  const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread;
+
+  const int pxIdx = pxIdxY * imgSize + pxIdxX;
+
+  const int startPxX = -sizeX / 2 + pxIdxX;
+  const int startPxY = -sizeX / 2 + pxIdxY;
+  const int imgIdx = blockImgIdx + threadIdx.x;
+
+  imgs += pxIdx * numImages + imgIdx;
+  denoms += pxIdx * numImages + imgIdx;
+  meanDiffs += imgIdx;
+  target += pxIdx * numImages + imgIdx;
+
+  float prod[numFilters][imgsPerThread];
+#pragma unroll
+  for (int i = 0; i < imgsPerThread; i++) {
+    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+      for (int f = 0; f < numFilters; f++) {
+        prod[f][i] = 0;
+      }
+    }
+  }
+  const int loopStartY = MAX(0, startPxY);
+  const int loopStartX = MAX(0, startPxX);
+  const int loopEndY = MIN(imgSize, startPxY + sizeX);
+  const int loopEndX = MIN(imgSize, startPxX + sizeX);
+
+  for (int y = loopStartY; y < loopEndY; y++) {
+    for (int x = loopStartX; x < loopEndX; x++) {
+      const int imgPx = y * imgSize + x;
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+          for (int f = 0; f < numFilters; f++) {
+            prod[f][i] += square(
+                meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]);
+          }
+        }
+      }
+    }
+  }
+
+#pragma unroll
+  for (int i = 0; i < imgsPerThread; i++) {
+    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
+#pragma unroll
+      for (int f = 0; f < numFilters; f++) {
+        prod[f][i] = minDiv + addScale * prod[f][i];
+        denoms[f * imgPixels * numImages + i * 128] = prod[f][i];
+        target[f * imgPixels * numImages + i * 128] =
+            imgs[f * imgPixels * numImages + i * 128] *
+            __powf(prod[f][i], -powScale);
+      }
+    }
+  }
+}
+
+/*
+ * Block size B_YxB_X
+ * blockIdx.x determines image idx in batches of B_X*imgsPerThread
+ * blockIdx.y determines filter idx in batches of B_Y*filtersPerThread
+ * blockIdx.z determines pixel
+ *
+ * So each block does one pixel for some number of images/filters.
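+ *
+ * Roughly, as a restatement of what the code computes (not additional
+ * requirements): for each (filter f, pixel p, image i),
+ *   denoms[f,p,i] = minDiv + addScale * sum of meanDiffs[f,.,i]^2 over the
+ *                   sizeX x sizeX spatial neighborhood of p
+ *   target[f,p,i] = imgs[f,p,i] * denoms[f,p,i]^(-powScale)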
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * means: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by B_Y*filtersPerThread + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kCNorm_manyfilter( + float* imgs, + float* meanDiffs, + float* denoms, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float addScale, + const float powScale, + const float minDiv) { + const int imgPixels = imgSize * imgSize; + + const int pxIdxX = blockIdx.z % imgSize; + const int pxIdxY = blockIdx.z / imgSize; + const int blockImgIdx = blockIdx.x * B_X * imgsPerThread; + const int blockFilterIdx = blockIdx.y * B_Y * filtersPerThread; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + + const int startPxX = -sizeX / 2 + pxIdxX; + const int startPxY = -sizeX / 2 + pxIdxY; + const int imgIdx = blockImgIdx + threadIdx.x; + imgs += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx; + denoms += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + target += + ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = 0; + } + } + } + + const int loopStartY = max(0, startPxY); + const int loopStartX = max(0, startPxX); + const int loopEndY = min(imgSize, startPxY + sizeX); + const int loopEndX = min(imgSize, startPxX + sizeX); + + for (int y = loopStartY; y < loopEndY; y++) { + for (int x = loopStartX; x < loopEndX; x++) { + const int imgPx = y * imgSize + x; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[f][i] += square( + meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]); + } + } + } + } + } +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[f][i] = minDiv + addScale * prod[f][i]; + denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + target[f * B_Y * imgPixels * numImages + i * B_X] = + imgs[f * B_Y * imgPixels * numImages + i * B_X] * + __powf(prod[f][i], -powScale); + } + } + } +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does 4x4 region of pixels for some number of images/filters. 
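+ *
+ * In outline (descriptive only): for each pixel of the block's input window,
+ * the threads first cooperatively stage meanDiffs for filtersPerThread
+ * filters and B_X*imgsPerThread images into shDiffs, and then each thread
+ * accumulates the squares that fall inside its own sizeX x sizeX
+ * neighborhood before the final denoms/target write-out.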
+ * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * imgs: (numFilters, imgPixels, numImages) + * means: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + */ +template < + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kCNorm2( + float* imgs, + float* meanDiffs, + float* denoms, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float addScale, + const float powScale, + const float minDiv) { + __shared__ float shDiffs[filtersPerThread][B_X * imgsPerThread]; + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockPxX = 4 * (blockIdx.x / numImgBlocks); + const int blockPxY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int startPxX = MAX(0, -sizeX / 2 + blockPxX); + const int startPxY = MAX(0, -sizeX / 2 + blockPxY); + const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3); + const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3); + + const int myPxX = blockPxX + threadIdx.y % 4; + const int myPxY = blockPxY + threadIdx.y / 4; + const int myPxIdx = myPxY * imgSize + myPxX; + // const bool doWork = myPxX < imgSize && myPxY < imgSize; + const int myStartPxY = -sizeX / 2 + myPxY; + const int myStartPxX = -sizeX / 2 + myPxX; + const int myEndPxY = myPxY + DIVUP(sizeX, 2); + const int myEndPxX = myPxX + DIVUP(sizeX, 2); + + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + meanDiffs += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = 0; + } + } + } + + for (int y = startPxY; y < endPxY; y++) { + const bool isInY = y >= myStartPxY && y < myEndPxY; + for (int x = startPxX; x < endPxX; x++) { + const int px = y * imgSize + x; +// All the threads load a pixel from memory +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shDiffs[ly + loadY][lx + loadX] = + 
meanDiffs[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Each row of threads decides if it's interested in this pixel + if (isInY && x >= myStartPxX && x < myEndPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]); + } + } + } + } + __syncthreads(); + } + } + // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX; + // imgs += threadIdx.x; + if (myPxX < imgSize && myPxY < imgSize) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] = minDiv + addScale * prod[f][i]; + denoms[f * imgPixels * numImages + i * B_X] = prod[f][i]; + target[f * imgPixels * numImages + i * B_X] = + imgs[f * imgPixels * numImages + i * B_X] * + __powf(prod[f][i], -powScale); + } + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * meanDiffs: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by B_Y + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool checkCaseBounds, + bool blocked> +__global__ void kFCNorm( + cudaTextureObject_t imgs, + cudaTextureObject_t meanDiffs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float addScale, + const float powScale, + const float minDiv) { + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int pxIdx = pxIdxY * imgSize + pxIdxX; + + const int imgIdx = blockImgIdx + threadIdx.x; + const int imgOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + const int meanDiffsOffset = pxIdx * numImages + imgIdx; + // imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; + // meanDiffs += pxIdx * numImages + imgIdx; + target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = 0; + } + } + + const int startF = + blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx; + const int loopStartF = blocked ? 
startF : MAX(0, startF); + const int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] += square(tex1Dfetch( + meanDiffs, meanDiffsOffset + f * imgPixels * numImages + i * B_X)); + } + } + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] = minDiv + addScale * prod[i]; + target[i * B_X] = tex1Dfetch(imgs, imgOffset + i * B_X) * + __powf(prod[i], -powScale); + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numOutputs, imgPixels, numImages) + * maxActs: (numOutputs, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this isn't really ideal + */ +template +__global__ void kCrossMapMaxPoolUndo( + float* imgs, + float* maxGrads, + float* maxActs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int startF, + const int poolSize, + const int numOutputs, + const int stride, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + // const int numOutputs = DIVUP(numFilters, stride); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + imgs += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + maxGrads += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; + maxActs += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; + target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + float prod[imgsPerThread]; + // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) { + // return; + // } +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + } + + if (filterIdx < numFilters) { + // const int startOut = max(0, (filterIdx-startF-poolSize)/ stride + + // 1); + const int loopStartOut = + max(0, (filterIdx - startF - poolSize) / stride + 1); + const int loopEndOut = min(numOutputs, (filterIdx - startF) / stride + 1); + + for (int o = loopStartOut; o < loopEndOut; ++o) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float ma = maxActs[o * imgPixels * numImages + i * B_X]; + const float mg = maxGrads[o * imgPixels * numImages + i * B_X]; + const float img = imgs[i * B_X]; + prod[i] += (img == ma) * mg; + } + } + } + // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF); + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = prod[i]; + } + } + } else { +#pragma 
unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } + } +} + +/* + * images: (numFilters, imgPixels, numImages) + * maxGrads: (numOutputs, imgPixels, numImages) + * maxActs: (numOutputs, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convCrossMapMaxPoolUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + const int imgSize, + const int startF, + const int poolSize, + const int stride, + const float scaleTargets, + const float scaleOutputs) { + int numImages = images.getNumCols(); + int imgPixels = imgSize * imgSize; + int numFilters = images.getNumRows() / imgPixels; + int numOutputs = maxActs.getNumRows() / imgPixels; + assert(images.getNumRows() == numFilters * imgPixels); + assert(maxGrads.getNumRows() == numOutputs * imgPixels); + assert(maxGrads.getNumCols() == numImages); + assert(maxGrads.isSameDims(maxActs)); + + assert(images.getNumRows() == numFilters * imgPixels); + + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(!maxGrads.isTrans()); + assert(!maxActs.isTrans()); + assert(images.isContiguous()); + assert(maxGrads.isContiguous()); + assert(maxActs.isContiguous()); + assert(maxGrads.isSameDims(maxActs)); + // assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(stride <= poolSize); + assert(startF <= 0); + assert( + startF + (numOutputs - 1) * stride + poolSize >= + numFilters); // All filters must be covered + + dim3 threads(32, 4); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + dim3 blocks( + imgSize * DIVUP(numImages, threads.x * imgsPerThread), + imgSize * DIVUP(numFilters, threads.y)); + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (scaleTargets == 0) { + target.resize(images); + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + kCrossMapMaxPoolUndo<4, 32, 4, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else if (imgsPerThread == 2) { + kCrossMapMaxPoolUndo<4, 32, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else { + kCrossMapMaxPoolUndo<4, 32, 1, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + kCrossMapMaxPoolUndo<4, 32, 1, false, true> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + assert(target.isSameDims(images)); + if (!checkCaseBounds) { + if (imgsPerThread == 4) { + kCrossMapMaxPoolUndo<4, 32, 4, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); 
+ } else if (imgsPerThread == 2) { + kCrossMapMaxPoolUndo<4, 32, 2, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } else { + kCrossMapMaxPoolUndo<4, 32, 1, true, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } else { + kCrossMapMaxPoolUndo<4, 32, 1, true, true> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + startF, + poolSize, + numOutputs, + stride, + scaleTargets, + scaleOutputs); + } + } + getLastCudaError("convCrossMapMaxPoolUndo: kernel execution failed"); +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this isn't really ideal + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool add, + bool checkCaseBounds, + bool blocked> +__global__ void kFRNormUndo( + cudaTextureObject_t outGrads, + cudaTextureObject_t denoms, + cudaTextureObject_t inputs, + cudaTextureObject_t acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + const int actsOffset = pxIdx * numImages + imgIdx; + const int inputOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + target += inputOffset; + float prod[imgsPerThread]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + } + + const int startF = blocked ? (filterIdx / sizeF) * sizeF + : -sizeF + sizeF / 2 + 1 + filterIdx; + const int loopStartF = blocked ? 
startF : MAX(0, startF); + const int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + prod[i] += tex1Dfetch( + acts, actsOffset + f * imgPixels * numImages + i * B_X); + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); + const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); + const float den = tex1Dfetch(denoms, inputOffset + i * B_X); + prod[i] = inp * prod[i] + out * __powf(den, -powScale); + target[i * B_X] = prod[i]; + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); + const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); + const float den = tex1Dfetch(denoms, inputOffset + i * B_X); + prod[i] = inp * prod[i] + out * __powf(den, -powScale); + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y + * + * TODO: this is pretty wasteful of computation. a lot of threads basically + * compute the same products. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + bool add, + bool checkCaseBounds, + bool blocked> +//__launch_bounds__(128,16) +__global__ void kFRNormUndo2( + cudaTextureObject_t outGrads, + cudaTextureObject_t inputs, + cudaTextureObject_t acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeF, + const float addScale, + const float powScale, + const float minDiv, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / B_Y; + + const int pxIdxX = blockIdx.x / numImgBlocks; + const int pxIdxY = blockIdx.y / numFilterBlocks; + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; + + const int imgPixels = imgSize * imgSize; + const int pxIdx = pxIdxY * imgSize + pxIdxX; + const int imgIdx = blockImgIdx + threadIdx.x; + + const int inpOffset = pxIdx * numImages + imgIdx; + const int outOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx; + + target += outOffset; + + float prod[imgsPerThread]; + float denoms[imgsPerThread]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i] = 0; + denoms[i] = 0; + } + + int startF = blocked ? (filterIdx / sizeF) * sizeF + : -sizeF + sizeF / 2 + 1 + filterIdx; + int loopStartF = blocked ? 
startF : MAX(0, startF); + int loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + // If an input is zero, then we shuldn't divide by it. + const float grad = tex1Dfetch( + outGrads, inpOffset + f * imgPixels * numImages + i * B_X); + const float act = tex1Dfetch( + acts, inpOffset + f * imgPixels * numImages + i * B_X); + const float inp = + tex1Dfetch( + inputs, inpOffset + f * imgPixels * numImages + i * B_X) + + (act == 0); + prod[i] += grad * act * __powf(__fdividef(act, inp), 1.0f / powScale); + } + } + } + + startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx; + loopStartF = blocked ? startF : MAX(0, startF); + loopEndF = MIN(numFilters, startF + sizeF); + + for (int f = loopStartF; f < loopEndF; ++f) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + denoms[i] += square(tex1Dfetch( + inputs, inpOffset + f * imgPixels * numImages + i * B_X)); + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, outOffset + i * B_X); + const float out = tex1Dfetch(outGrads, outOffset + i * B_X); + denoms[i] = addScale * denoms[i] + minDiv; + prod[i] = + (-2 * powScale * addScale * inp * prod[i] + + out * __powf(denoms[i], -powScale)); + target[i * B_X] = prod[i]; + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { + const float inp = tex1Dfetch(inputs, outOffset + i * B_X); + const float out = tex1Dfetch(outGrads, outOffset + i * B_X); + denoms[i] = addScale * denoms[i] + minDiv; + prod[i] = + (-2 * powScale * addScale * inp * prod[i] + + out * __powf(denoms[i], -powScale)); + target[i * B_X] = + scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output pixel for some number of images/filters. 
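+ *
+ * Informally (mirroring the loops below): each image pixel accumulates, over
+ * every pooling window that covers it, avgGrads[window] divided by that
+ * window's (border-clipped) area, or the unscaled gradient when the sum
+ * template flag is set; the result is then written to target, blended with
+ * scaleTargets/scaleOutputs when add is set.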
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * rMaxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + */ + +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool sum, + bool add, + bool checkCaseBounds> +__global__ void kLocalAvgUndo( + float* avgGrads, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread)); + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y * + filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startOutputY = + blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; + const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); + const int startOutputX = + blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX; + const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX); + + const int imgIdx = blockImgIdx + threadIdx.x; + + avgGrads += + ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx; + target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + if (blockPxX >= startX && + blockPxX < startX + strideX * (outputsX - 1) + subsX && + blockPxY >= startX && + blockPxY < startX + strideX * (outputsX - 1) + subsX) { + for (int my = startOutputY; my < endOutputY; my++) { + const float regionStartY = fmaxf(0, startX + my * strideX); + const float regionEndY = fminf(imgSize, startX + my * strideX + subsX); + const float regionSizeY = regionEndY - regionStartY; + for (int mx = startOutputX; mx < endOutputX; mx++) { + const int outputIdx = my * outputsX + mx; + const float regionStartX = fmaxf(0, startX + mx * strideX); + const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX); + const float regionSizeX = regionEndX - regionStartX; + // It's important to do the division here, because pushing division into + // the below loops makes the code 4x slower. + const float regionSizeInv = + sum ? 
1.0f : (1.0f / (regionSizeX * regionSizeY)); +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += + avgGrads + [(f * B_Y * numOutputs + outputIdx) * numImages + + i * B_X] * + regionSizeInv; + } + } + } + } + } + } + + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + target[f * B_Y * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } +} + +/* + * Block size B_YxB_X + * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread + * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread + * + * So each block does one output pixel for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * maxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool add, + bool checkCaseBounds> +__global__ void kLocalMaxUndo( + float* imgs, + float* maxGrads, + float* maxActs, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int subsX, + const int startX, + const int strideX, + const int outputsX, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImgs[B_Y * filtersPerThread][B_X * imgsPerThread]; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread)); + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y * + filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int numOutputs = outputsX * outputsX; + const int imgPixels = imgSize * imgSize; + + const int startOutputY = + blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; + const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); + const int startOutputX = + blockPxX - startX < subsX ? 
0 : 1 + (blockPxX - startX - subsX) / strideX;
+  const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
+
+  const int imgIdx = blockImgIdx + threadIdx.x;
+
+  imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
+      imgIdx;
+  maxGrads +=
+      ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
+  maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
+
+  target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
+      imgIdx;
+
+  float prod[filtersPerThread][imgsPerThread];
+#pragma unroll
+  for (int f = 0; f < filtersPerThread; f++) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      prod[f][i] = 0;
+    }
+  }
+
+  if (blockPxX >= startX &&
+      blockPxX < startX + strideX * (outputsX - 1) + subsX &&
+      blockPxY >= startX &&
+      blockPxY < startX + strideX * (outputsX - 1) + subsX) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] =
+              imgs[f * B_Y * imgPixels * numImages + i * B_X];
+        }
+      }
+    }
+    for (int my = startOutputY; my < endOutputY; my++) {
+      for (int mx = startOutputX; mx < endOutputX; mx++) {
+        const int outputIdx = my * outputsX + mx;
+#pragma unroll
+        for (int i = 0; i < imgsPerThread; i++) {
+          if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+            for (int f = 0; f < filtersPerThread; f++) {
+              const float ma = maxActs
+                  [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+              const float mg = maxGrads
+                  [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
+              const float img =
+                  shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i];
+
+              prod[f][i] += (img == ma) * mg;
+            }
+          }
+        }
+      }
+    }
+  }
+  if (!add) {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
+        }
+      }
+    }
+  } else {
+#pragma unroll
+    for (int i = 0; i < imgsPerThread; i++) {
+      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
+#pragma unroll
+        for (int f = 0; f < filtersPerThread; f++) {
+          target[f * B_Y * imgPixels * numImages + i * B_X] =
+              scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] +
+              scaleOutputs * prod[f][i];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * acts := -2 x scale x acts x outGrads / denoms
+ */
+template <int B_X, int eltsPerThread>
+__global__ void kRNormUndoPrelims(
+    float* acts,
+    cudaTextureObject_t denoms,
+    cudaTextureObject_t outGrads,
+    const uint numElements,
+    const float scale) {
+  const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x;
+  const uint numThreads = B_X * gridDim.x;
+  for (uint i = e; i < numElements; i += numThreads * eltsPerThread) {
+#pragma unroll
+    for (uint k = 0; k < eltsPerThread; k++) {
+      if (i + k * B_X < numElements) {
+        acts[i + k * B_X] = __fdividef(
+            scale * tex1Dfetch<float>(outGrads, i + k * B_X) *
+                acts[i + k * B_X],
+            tex1Dfetch<float>(denoms, i + k * B_X));
+      }
+    }
+  }
+}
+
+/*
+ * Block size B_YxB_X
+ * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
+ * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
+ *
+ * So each block does one output pixel for some number of images/filters.
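+ *
+ * Added note (not part of the original kernel sources): kRNormUndoPrelims
+ * above overwrites acts in place with
+ *   acts := -2 * addScale * powScale * outGrads * acts / denoms,
+ * and with the forward-pass convention acts = inputs * denoms^(-powScale),
+ * each rewritten entry is the "cross" term of the response-norm gradient.
+ * This kernel then only has to sum those terms over the local window and add
+ * the direct term, which is what
+ *   prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale)
+ * computes in the write-out loops below.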
+ * + * threadIdx.x determines img idx + * threadIdx.y determines filter idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * numImages must be divisible by B_X*imgsPerThread + * numFilters must be divisible by B_Y*filtersPerThread + * + * TODO: this isn't really ideal + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + bool checkCaseBounds> +__global__ void kRNormUndo( + float* outGrads, + float* denoms, + float* inputs, + float* acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (B_Y * filtersPerThread); + + const int blockPxX = blockIdx.x / numImgBlocks; + const int blockPxY = blockIdx.y / numFilterBlocks; + + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = + (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread; + + const int blockPx = blockPxY * imgSize + blockPxX; + const int imgPixels = imgSize * imgSize; + + const int startY = MAX(0, blockPxY + sizeX / 2 - sizeX + 1); + const int startX = MAX(0, blockPxX + sizeX / 2 - sizeX + 1); + const int endY = MIN(imgSize, blockPxY + sizeX / 2 + 1); + const int endX = MIN(imgSize, blockPxX + sizeX / 2 + 1); + + const int imgIdx = blockImgIdx + threadIdx.x; + + acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx; + inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + outGrads += + ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + for (int sy = startY; sy < endY; sy++) { + for (int sx = startX; sx < endX; sx++) { + const int outPx = sy * imgSize + sx; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += + acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X]; + } + } + } + } + } + // outGrads += blockPx * numImages; + if (scaleTargets == 0) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float out = + outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float inp = inputs[(f * B_Y * 
imgPixels) * numImages + i * B_X]; + const float out = + outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; + const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * B_Y * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } +} + +/* + * Block size 16xB_X + * blockIdx.x determines 4x4 pixel.x region, image idx in batches of + * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in + * batches of filtersPerThread + * + * So each block does 4x4 region for some number of images/filters. + * + * threadIdx.x determines img idx + * threadIdx.y determines pixel idx + * + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * B_X one of 8, 16, 32 + * imgsPerThread one of 1, 2, 4, 8, 16 + * + * B_XximgsPerThread MUST be divisible by 32. + * Number of filters MUST be divisible by filtersPerThread. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false + * numFilters must be divisible by filtersPerThread + * + * Final write-out will not be fully coalesced unless B_X is 32. But there's a + * lot more reading than writing here, and the reading is all coalesced, so it + * should be OK. + */ +template < + int B_X, + int imgsPerThread, + int filtersPerThread, + bool add, + bool checkCaseBounds> +__global__ void kRNormUndo2( + float* outGrads, + float* denoms, + float* inputs, + float* acts, + float* target, + const int imgSize, + const int numFilters, + const int numImages, + const int sizeX, + const float powScale, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shActs[filtersPerThread][B_X * imgsPerThread]; + const int imgPixels = imgSize * imgSize; + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int numFilterBlocks = numFilters / (filtersPerThread); + const int blockPxX = 4 * (blockIdx.x / numImgBlocks); + const int blockPxY = 4 * (blockIdx.y / numFilterBlocks); + const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + const int startPxX = MAX(0, -DIVUP(sizeX, 2) + blockPxX + 1); + const int startPxY = MAX(0, -DIVUP(sizeX, 2) + blockPxY + 1); + const int endPxX = MIN(imgSize, blockPxX + sizeX / 2 + 4); + const int endPxY = MIN(imgSize, blockPxY + sizeX / 2 + 4); + + const int myPxX = blockPxX + threadIdx.y % 4; + const int myPxY = blockPxY + threadIdx.y / 4; + const int myPxIdx = myPxY * imgSize + myPxX; + // const bool doWork = myPxX < imgSize && myPxY < imgSize; + const int myStartPxY = -DIVUP(sizeX, 2) + myPxY + 1; + const int myStartPxX = -DIVUP(sizeX, 2) + myPxX + 1; + const int myEndPxY = myPxY + sizeX / 2 + 1; + const int myEndPxX = myPxX + sizeX / 2 + 1; + + const int imgIdx = blockImgIdx + threadIdx.x; + + acts += + (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; + denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + target += 
(blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[f][i] = 0; + } + } + + for (int y = startPxY; y < endPxY; y++) { + const bool isInY = y >= myStartPxY && y < myEndPxY; + for (int x = startPxX; x < endPxX; x++) { + const int px = y * imgSize + x; +// All the threads load a pixel from memory +#pragma unroll + for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) { + if (filtersPerThread % (B_X / 2) == 0 || + ly + loadY < filtersPerThread) { +#pragma unroll + for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) { + if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { + shActs[ly + loadY][lx + loadX] = + acts[(ly * imgPixels + px) * numImages + lx]; + } + } + } + } + __syncthreads(); + + // Each row of threads decides if it's interested in this pixel + if (isInY && x >= myStartPxX && x < myEndPxX) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][i] += shActs[f][threadIdx.x + i * B_X]; + } + } + } + } + __syncthreads(); + } + } + acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX; + acts += threadIdx.x; + if (myPxX < imgSize && myPxY < imgSize) { + if (!add) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float out = outGrads[f * imgPixels * numImages + i * B_X]; + const float den = denoms[f * imgPixels * numImages + i * B_X]; + const float inp = inputs[f * imgPixels * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * imgPixels * numImages + i * B_X] = prod[f][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || imgIdx + i * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + const float out = outGrads[f * imgPixels * numImages + i * B_X]; + const float den = denoms[f * imgPixels * numImages + i * B_X]; + const float inp = inputs[f * imgPixels * numImages + i * B_X]; + prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); + target[f * imgPixels * numImages + i * B_X] = + scaleTargets * target[f * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[f][i]; + } + } + } + } + } +} + +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX) { + convLocalMaxUndo( + images, + maxGrads, + maxActs, + target, + subsX, + startX, + strideX, + outputsX, + 0, + 1); +} + +/* + * imgs: (numFilters, imgPixels, numImages) + * maxGrads: (numFilters, numOutputs, numImages) + * rMaxActs: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convLocalMaxUndo( + NVMatrix& images, + NVMatrix& maxGrads, + NVMatrix& maxActs, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + float scaleTargets, + float scaleOutput) { + int outputs = outputsX * outputsX; + int numImages = images.getNumCols(); + int numFilters = maxGrads.getNumRows() / outputs; + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + + assert(imgSize * 
imgSize == imgPixels); + assert(maxGrads.getNumRows() == numFilters * outputs); + assert(maxGrads.getNumCols() == numImages); + assert(!images.isTrans()); + assert(!target.isTrans()); + assert(!maxGrads.isTrans()); + assert(!maxActs.isTrans()); + assert(images.isContiguous()); + assert(maxGrads.isContiguous()); + assert(maxActs.isContiguous()); + assert(maxGrads.isSameDims(maxActs)); + assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(strideX <= subsX); + + target.resize(images); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + int checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 2)) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 4, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 4, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 4, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 4, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 2, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 2, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 2, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 2, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 1, 2, false, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + 
numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 1, 2, true, true><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalMaxUndo<4, 32, 1, 2, false, false> + <<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalMaxUndo<4, 32, 1, 2, true, false><<>>( + images.getDevData(), + maxGrads.getDevData(), + maxActs.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + + getLastCudaError("convLocalMaxUndo: kernel execution failed"); +} + +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum) { + convLocalAvgUndo( + avgGrads, target, subsX, startX, strideX, outputsX, imgSize, sum, 0, 1); +} + +/* + * avgGrads: (numFilters, numOutputs, numImages) + * target: (numFilters, imgPixels, numImages) + */ +void convLocalAvgUndo( + NVMatrix& avgGrads, + NVMatrix& target, + int subsX, + int startX, + int strideX, + int outputsX, + int imgSize, + bool sum, + float scaleTargets, + float scaleOutput) { + int numImages = avgGrads.getNumCols(); + + int outputs = outputsX * outputsX; + int imgPixels = imgSize * imgSize; + int numFilters = avgGrads.getNumRows() / outputs; + assert(avgGrads.getNumRows() == numFilters * outputs); + + assert(!target.isTrans()); + assert(!avgGrads.isTrans()); + assert(avgGrads.isContiguous()); + assert(numFilters % 16 == 0); + // assert(numImages % 128 == 0); + + assert(strideX <= subsX); + + target.resize(numFilters * imgPixels, numImages); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + int checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 4)) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + bool scale = !(scaleTargets == 0 && scaleOutput == 1); + if (sum) { + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, true, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, true, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, true, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, true, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + } else { + if (imgsPerThread == 4) { + if (checkCaseBounds) { + if 
(scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 4, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 4, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 2, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 2, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, false, false, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, false, true, true> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + kLocalAvgUndo<4, 32, 1, 4, false, false, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } else { + kLocalAvgUndo<4, 32, 1, 4, false, true, false> + <<>>( + avgGrads.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + subsX, + startX, + strideX, + outputsX, + scaleTargets, + scaleOutput); + } + } + } + } + + getLastCudaError("convLocalAvgUndo: kernel execution failed"); +} + +void convResponseNorm( + NVMatrix& images, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv) { + convContrastNorm( + images, + images, + denoms, + target, + numFilters, + sizeX, + addScale, + powScale, + minDiv); +} + +/* + * images: (numFilters, imgPixels, numImages) + * meanDiffs: 
(numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + */ +void convContrastNorm( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& denoms, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float minDiv) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + assert(meanDiffs.isSameDims(images)); + + assert(!meanDiffs.isTrans()); + assert(!images.isTrans()); + assert(images.isContiguous()); + assert(meanDiffs.isContiguous()); + assert(numFilters % 16 == 0 || numFilters <= 8); + + target.resize(images); + denoms.resize(images); + assert(target.isContiguous()); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (sizeX >= 6 && numFilters % 4 == 0) { + // This one is faster for large regions (my tests show regions >= 6...) + int imgsPerThread = 8; + int filtersPerThread = 4; + int bx = 8; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + assert(numFilters % filtersPerThread == 0); + dim3 threads(bx, 16); + dim3 blocks( + DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(imgSize, 4) * numFilters / filtersPerThread); + + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here + kCNorm2<8, 8, 4, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here + kCNorm2<8, 8, 4, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else { + bool checkCaseBounds = numImages % 128 != 0; + if (numFilters <= 8) { + dim3 threads(128); + dim3 blocks(DIVUP(numImages, 128) * imgSize, imgSize); + if (numFilters == 1) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 1, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 1, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 2, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 2, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 3) { + if (checkCaseBounds) { + 
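// Added note: the second template argument of kCNorm_fewfilter tracks numFilters, so each small filter count (1-8) gets its own instantiation with compile-time loop bounds. +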
cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 3, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 3, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 4, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 4, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 5) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 5, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 5, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 6) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 6, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 6, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 7) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 7, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 7, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } else if (numFilters == 8) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1); + kCNorm_fewfilter<1, 8, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1); + 
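// Added note: cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1) asks the CUDA runtime to favor a larger L1 cache over shared memory when this kernel is next launched; it is a preference the driver may ignore. +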
kCNorm_fewfilter<1, 8, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } + } else { + dim3 threads(32, 4); + dim3 blocks( + DIVUP(numImages, threads.x * 4), + (numFilters / (threads.y * 2)), + imgPixels); + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kCNorm_manyfilter<4, 32, 4, 2, true><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kCNorm_manyfilter<4, 32, 4, 2, false><<>>( + images.getDevData(), + meanDiffs.getDevData(), + denoms.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + addScale, + powScale, + minDiv); + } + } + } + getLastCudaError("convResponseNorm: kernel execution failed"); +} + +void convContrastNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& meanDiffs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput) { + convResponseNormUndo( + outGrads, + denoms, + meanDiffs, + acts, + target, + numFilters, + sizeX, + addScale, + powScale, + scaleTargets, + scaleOutput); +} + +/* + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * THIS WILL OVERWRITE THE ACTS MATRIX. + */ +void convResponseNormUndo( + NVMatrix& outGrads, + NVMatrix& denoms, + NVMatrix& inputs, + NVMatrix& acts, + NVMatrix& target, + int numFilters, + int sizeX, + float addScale, + float powScale, + float scaleTargets, + float scaleOutput) { + int numImages = outGrads.getNumCols(); + int imgPixels = outGrads.getNumRows() / numFilters; + + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + + assert(outGrads.getNumRows() == numFilters * imgPixels); + + assert(denoms.isSameDims(outGrads)); + assert(acts.isSameDims(denoms)); + assert(!denoms.isTrans()); + assert(!outGrads.isTrans()); + assert(!acts.isTrans()); + assert(!target.isTrans()); + assert(outGrads.isContiguous()); + + assert(numFilters % 16 == 0); + + target.resize(outGrads); + assert(target.isContiguous()); + // First do acts := -2 x scale x acts x outGrads / denoms + // so that the main routine only has to do an addition in its inner loop. + int prelimEltsPerThread = 8; + dim3 threads(128); + dim3 blocks( + DIVUP(outGrads.getNumElements(), (threads.x * prelimEltsPerThread))); + bool checkPrelimBounds = + outGrads.getNumElements() % (threads.x * prelimEltsPerThread) != 0; + // printf("num elts: %d, blocks: %d\n", outGrads.getNumElements(), blocks.x); + cudaStream_t stream = NVMatrix::getDefaultStream(); + kRNormUndoPrelims<128, 8><<>>( + acts.getDevData(), + denoms.getTextureObject(), + outGrads.getTextureObject(), + outGrads.getNumElements(), + -2 * addScale * powScale); + + // Now the main routine + if (sizeX >= 6 && numFilters % 4 == 0) { + // This one is faster for large regions (my tests show regions >= 6...) + // NOTE: this stuff is not optimized for Kepler. Only kRNormUndo is. + int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
4 : 2; + int filtersPerThread = 4; + int bx = 16; + bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0; + assert((imgsPerThread * bx) % 32 == 0); + + threads = dim3(bx, 16); + blocks = dim3( + DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread), + DIVUP(imgSize, 4) * numFilters / filtersPerThread); + if (imgsPerThread == 8) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 8, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } else if (imgsPerThread == 4) { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 4, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } else { + if (checkCaseBounds) { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, true, true>, 
cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, true, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, false, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (scaleTargets == 0 && scaleOutput == 1) { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, true, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1); + kRNormUndo2<16, 2, 4, false, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } + } else { + int imgsPerThread = numImages % 128 == 0 ? 4 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + threads = dim3(32, 4); + blocks = dim3( + DIVUP(numImages, 32 * imgsPerThread) * imgSize, + (numFilters / (4 * 2)) * imgSize); + + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 4, 2, true>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 4, 2, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 4, 2, false>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 4, 2, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 1, 2, true>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 1, 2, true><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } else { + cudaFuncSetCacheConfig( + kRNormUndo<4, 32, 1, 2, false>, cudaFuncCachePreferL1); + kRNormUndo<4, 32, 1, 2, false><<>>( + outGrads.getDevData(), + denoms.getDevData(), + inputs.getDevData(), + acts.getDevData(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeX, + powScale, + scaleTargets, + scaleOutput); + } + } + } + getLastCudaError("kRNormUndo: kernel execution failed"); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + * + * imgSize = scale * tgtSize + */ +void convResizeBilinear( + NVMatrix& images, + NVMatrix& target, + int imgSize, + int tgtSize, + float scale) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = imgSize * imgSize; + int tgtPixels = tgtSize * tgtSize; + int numChannels 
= images.getNumRows() / imgPixels; + int numImages = images.getNumCols(); + assert(images.getNumRows() == numChannels * imgPixels); + + target.resize(numChannels * tgtPixels, numImages); + assert(target.isContiguous()); + int numChunksX = DIVUP(tgtSize, 4); + int numChunks = numChunksX * numChunksX; + double imgCenter = imgSize * 0.5; + double tgtCenter = tgtSize * 0.5; + double centerScale = imgCenter - tgtCenter * scale; + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + cudaStream_t stream = NVMatrix::getDefaultStream(); + dim3 threads(32, 16); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1); + kResizeBilinear<4, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1); + kResizeBilinear<4, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1); + kResizeBilinear<2, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1); + kResizeBilinear<2, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1); + kResizeBilinear<1, true><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } else { + cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1); + kResizeBilinear<1, false><<>>( + images.getDevData(), + target.getDevData(), + imgSize, + tgtSize, + numImages, + images.getStride(), + scale, + centerScale); + } + } + getLastCudaError("convResizeBilinear: kernel execution failed"); +} + +/* + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + */ +void convRGBToYUV(NVMatrix& images, NVMatrix& target) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = images.getNumRows() / 3; + int numImages = images.getNumCols(); + assert(images.getNumRows() == 3 * imgPixels); + + target.resize(3 * imgPixels, numImages); + assert(target.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + cudaStream_t stream = NVMatrix::getDefaultStream(); + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1); + kRGBToYUV<4, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1); + kRGBToYUV<4, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1); + kRGBToYUV<2, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1); + kRGBToYUV<2, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1); + kRGBToYUV<1, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1); + kRGBToYUV<1, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + getLastCudaError("convRGBToYUV: kernel execution failed"); +} + +/* + * imgs: (3, imgPixels, numImages) with given imgStride + * target: (3, imgPixels, numImages) + */ +void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) { + assert(!images.isTrans()); + assert(!target.isTrans()); + int imgPixels = images.getNumRows() / 3; + int numImages = images.getNumCols(); + assert(images.getNumRows() == 3 * imgPixels); + + target.resize(3 * imgPixels, numImages); + assert(target.isContiguous()); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<4, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<4, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<4, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<4, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<4, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<4, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<4, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } else if (imgsPerThread == 2) { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<2, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<2, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<2, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<2, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<2, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<2, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<2, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } else { + if (center) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1); + kRGBToLAB<1, true, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<1, false, true>, cudaFuncCachePreferL1); + kRGBToLAB<1, false, true><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kRGBToLAB<1, true, false>, cudaFuncCachePreferL1); + kRGBToLAB<1, true, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } else { + cudaFuncSetCacheConfig( + kRGBToLAB<1, false, false>, cudaFuncCachePreferL1); + kRGBToLAB<1, false, false><<>>( + images.getDevData(), + target.getDevData(), + imgPixels, + numImages, + images.getStride()); + } + } + } + getLastCudaError("convRGBToLAB: kernel execution failed"); +} + +/* + * imgs: (numChannels, imgPixels, numImages) with given imgStride + * target: (numChannels, tgtPixels, numImages) + */ +void convCrop( + NVMatrix& imgs, + NVMatrix& target, + int imgSize, + int tgtSize, + int startY, + int startX) { + int numImages = imgs.getNumCols(); + int imgPixels = imgSize * imgSize; + int tgtPixels = tgtSize * tgtSize; + + int numChannels = 
imgs.getNumRows() / imgPixels; + assert(imgs.getNumRows() == imgPixels * numChannels); + assert(imgPixels == imgSize * imgSize); + assert(imgSize - startY >= tgtSize); + assert(imgSize - startX >= tgtSize); + assert(startY >= 0); + assert(startX >= 0); + target.resize(numChannels * tgtPixels, numImages); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0; + dim3 blocks( + DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4)); + dim3 threads(32, 4); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (imgsPerThread == 4) { + if (checkCaseBounds) { + kCrop<4, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<4, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } else if (imgsPerThread == 2) { + if (checkCaseBounds) { + kCrop<2, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<2, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } else { + if (checkCaseBounds) { + kCrop<1, true><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } else { + kCrop<1, false><<>>( + imgs.getDevData(), + target.getDevData(), + numImages, + imgs.getStride(), + imgSize, + tgtSize, + startY, + startX); + } + } + getLastCudaError("convCrop: kernel execution failed"); +} + +/* + * images: (numFilters, imgPixels, numImages) + * meanDiffs: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) (out) + * target: (numFilters, imgPixels, numImages) (out) + + * Note: at present, I have no code to compute the meanDiffs. So it should be + set + * to be equal to images. In other words, this isn't really doing contrast + normalization, + * just response normalization. 
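+ *
+ * Added usage sketch (argument values are illustrative only): passing the
+ * same matrix for images and meanDiffs therefore gives plain cross-map
+ * response normalization, e.g.
+ *
+ *   convContrastNormCrossMap(images, images, target, numFilters, sizeF,
+ *                            addScale, powScale, 1.0f, true);
+ *
+ * with minDiv = 1 and blocked normalization enabled, mirroring how
+ * convResponseNorm above forwards to convContrastNorm with images passed
+ * twice.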
+ */ +void convContrastNormCrossMap( + NVMatrix& images, + NVMatrix& meanDiffs, + NVMatrix& target, + int numFilters, + int sizeF, + float addScale, + float powScale, + float minDiv, + bool blocked) { + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numFilters; + assert(images.getNumRows() == numFilters * imgPixels); + int imgSize = int(sqrt(imgPixels)); + assert(imgSize * imgSize == imgPixels); + assert(meanDiffs.isSameDims(images)); + assert(sizeF > 0 && sizeF <= numFilters); + + assert(!meanDiffs.isTrans()); + assert(!images.isTrans()); + assert(images.isContiguous()); + assert(meanDiffs.isContiguous()); + assert(numFilters % 16 == 0); + + target.resize(images); + // denoms.resize(images); + assert(target.isContiguous()); + + bool checkCaseBounds = numImages % 128 != 0; + + dim3 threads(32, 4); + dim3 blocks(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize); + cudaStream_t stream = NVMatrix::getDefaultStream(); + // printf("convContrastNormCrossMap imgs: %p, meanDiffs: %p, denoms: %p, + // target: %p, imgSize: %d, numFilters: %d, numImages: %d, sizeF: %d, + // addScale: %f, powScale: %f, minDiv: %f, blocked: %d\n", + // images.getDevData(), meanDiffs.getDevData(), + // denoms.getDevData(), target.getDevData(), imgSize, numFilters, + // numImages, sizeF, addScale, powScale, minDiv, blocked); + if (blocked) { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, true, true><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, false, true><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } + } else { + if (checkCaseBounds) { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, true, false>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, true, false><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } else { + cudaFuncSetCacheConfig( + kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1); + kFCNorm<4, 32, 4, false, false><<>>( + images.getTextureObject(), + meanDiffs.getTextureObject(), + target.getDevData(), + imgSize, + numFilters, + numImages, + sizeF, + addScale, + powScale, + minDiv); + } + } + + getLastCudaError("convContrastNormCrossMap: kernel execution failed"); +} + +/* + * outGrads: (numFilters, imgPixels, numImages) + * denoms: (numFilters, imgPixels, numImages) + * inputs: (numFilters, imgPixels, numImages) + * acts: (numFilters, imgPixels, numImages) + * target: (numFilters, imgPixels, numImages) + * + * THIS WILL OVERWRITE THE ACTS MATRIX. 
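+ *
+ * Added note: this is the cross-map counterpart of convResponseNormUndo
+ * above; the accumulation runs over a window of sizeF neighbouring filters
+ * (contiguous blocks of sizeF filters when blocked is true) rather than over
+ * a sizeX x sizeX spatial window. Because acts is consumed as scratch space,
+ * callers that still need the forward-pass activations should copy them
+ * before calling this routine.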
+
+/*
+ * outGrads:   (numFilters, imgPixels, numImages)
+ * denoms:     (numFilters, imgPixels, numImages)
+ * inputs:     (numFilters, imgPixels, numImages)
+ * acts:       (numFilters, imgPixels, numImages)
+ * target:     (numFilters, imgPixels, numImages)
+ *
+ * THIS WILL OVERWRITE THE ACTS MATRIX.
+ */
+void convResponseNormCrossMapUndo(
+    NVMatrix& outGrads,
+    NVMatrix& inputs,
+    NVMatrix& acts,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    float minDiv,
+    bool blocked,
+    float scaleTargets,
+    float scaleOutput) {
+  int numImages = outGrads.getNumCols();
+  int imgPixels = outGrads.getNumRows() / numFilters;
+
+  int imgSize = int(sqrt(imgPixels));
+  assert(imgSize * imgSize == imgPixels);
+  assert(sizeF > 0 && sizeF <= numFilters);
+  assert(outGrads.getNumRows() == numFilters * imgPixels);
+
+  assert(!outGrads.isTrans());
+  assert(!acts.isTrans());
+  assert(!target.isTrans());
+  assert(outGrads.isContiguous());
+
+  assert(numFilters % 16 == 0);
+
+  target.resize(outGrads);
+  assert(target.isContiguous());
+  // First do acts := -2 x scale x acts x outGrads / denoms
+  // so that the main routine only has to do an addition in its inner loop.
+  cudaStream_t stream = NVMatrix::getDefaultStream();
+
+  dim3 threads2 = dim3(32, 4);
+  dim3 blocks2 =
+      dim3(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize);
+
+  bool checkCaseBounds = (numImages % 128) != 0;
+  if (blocked) {
+    if (scaleTargets == 0 && scaleOutput == 1) {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, true, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, true, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, false, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, false, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    } else {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, true, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, true, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, false, true>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, false, true>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    }
+  } else {
+    if (scaleTargets == 0 && scaleOutput == 1) {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, true, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, true, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, false, false, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, false, false, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    } else {
+      if (checkCaseBounds) {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, true, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, true, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      } else {
+        cudaFuncSetCacheConfig(
+            kFRNormUndo2<4, 32, 4, true, false, false>, cudaFuncCachePreferL1);
+        kFRNormUndo2<4, 32, 4, true, false, false>
+            <<<blocks2, threads2, 0, stream>>>(
+                outGrads.getTextureObject(),
+                inputs.getTextureObject(),
+                acts.getTextureObject(),
+                target.getDevData(),
+                imgSize,
+                numFilters,
+                numImages,
+                sizeF,
+                addScale,
+                powScale,
+                minDiv,
+                scaleTargets,
+                scaleOutput);
+      }
+    }
+  }
+
+  getLastCudaError("convResponseNormCrossMapUndo: kernel execution failed");
+}
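+
+/*
+ * Sketch of the gradient being undone here (an assumption, inferred from the
+ * "acts := -2 x scale x acts x outGrads / denoms" note above rather than from
+ * the kernel source): with denom[f] = minDiv + addScale * sum_j inputs[j]^2
+ * and acts[f] = inputs[f] * pow(denom[f], -powScale), the input gradient is
+ * believed to combine a direct term, outGrads[f] * pow(denom[f], -powScale),
+ * with a cross-map term proportional to
+ *
+ *   -2 * addScale * powScale * inputs[f] * sum_j (outGrads[j] * acts[j] / denom[j])
+ *
+ * As in the other routines, the result is written back as
+ *   target := scaleTargets * target + scaleOutput * gradient,
+ * so scaleTargets = 0 with scaleOutput = 1 simply overwrites target.
+ */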
+
+void convResponseNormCrossMap(
+    NVMatrix& images,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    float minDiv,
+    bool blocked) {
+  convContrastNormCrossMap(
+      images,
+      images,
+      target,
+      numFilters,
+      sizeF,
+      addScale,
+      powScale,
+      minDiv,
+      blocked);
+}
+
+/*
+ * images:     (numFilters, imgPixels, numImages)
+ * denoms:     (numFilters, imgPixels, numImages) (out)
+ * target:     (numFilters, imgPixels, numImages) (out)
+ */
+void convResponseNormCrossMap(
+    NVMatrix& images,
+    NVMatrix& target,
+    int numFilters,
+    int sizeF,
+    float addScale,
+    float powScale,
+    bool blocked) {
+  convContrastNormCrossMap(
+      images,
+      images,
+      target,
+      numFilters,
+      sizeF,
+      addScale,
+      powScale,
+      1,
+      blocked);
+}
+
+cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor) {
+  cudaTextureObject_t tex_obj;
+  cudaResourceDesc res_desc;
+  std::memset(&res_desc, 0, sizeof(res_desc));
+  res_desc.resType = cudaResourceTypeLinear;
+  res_desc.res.linear.devPtr = tensor->mutable_data<float>();
+  res_desc.res.linear.sizeInBytes = tensor->nbytes();
+  res_desc.res.linear.desc =
+      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+  cudaTextureDesc tex_desc;
+  std::memset(&tex_desc, 0, sizeof(tex_desc));
+  CUDA_ENFORCE(
+      cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
+  return tex_obj;
+}
diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
new file mode 100644
index 0000000..3fb31c5
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
@@ -0,0 +1,6081 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../include/cudaconv2.cuh" + +__device__ __forceinline__ void +filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + int fPidx, + int imgLoadModPosY, + int imgLoadModPosX, + int imgSizeX, + int filterSize, + int& iPidx) { + int x = imgLoadModPosX + (fPidx) % filterSize; + int y = imgLoadModPosY + (fPidx) / filterSize; + iPidx = + y >= 0 && y < imgSizeX && x >= 0 && x < imgSizeX ? y * imgSizeX + x : -1; +} + +#define FA_COLOR3_IMPRELOAD(c, i) \ + imPreload[c][i] = \ + iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \ + ? 0 \ + : mm[c * imgPixels * imgStride + i * B_X]; +#define FA_COLOR3_IMPRELOAD_TX(c, i) \ + imPreload[c][i] = \ + iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \ + ? 0 \ + : tex1Dfetch( \ + images, imagesOffset2 + c * imgPixels * imgStride + i * B_X); + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +//__launch_bounds__(128,3) +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[numColors][pixelCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + // images += myImgIdx; + // filters += blockFilterIdx + // + shFilterLoadY * numFilters + shFilterLoadX; + // if (!conv) { // NOTE: UNTESTED! + // filters += moduleIdx * numColors * filterPixels * numFilters; + // } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + + shFilterLoadX + + (conv ? 
0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][pixelCache * filtersPerThread / B_X]; + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int p = 0; p < pixelCache; p += B_X / filtersPerThread) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p * filtersPerThread / B_X] = tex1Dfetch( + filters, + filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else { + fPreload[c][p * filtersPerThread / B_X] = 0; + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && + (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch( + images, + imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { + // NOTE: bank conflicts here! + shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + fPidxNext + ty, + imgLoadModPosY, + imgLoadModPosX, + imgSizeX, + filterSize, + iPidxNext); + + // const float* ff = &filters[numFilters * fPidxNext]; + // const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + + FA_COLOR3_IMPRELOAD_TX(0, 0); + FA_COLOR3_IMPRELOAD_TX(0, 1); + FA_COLOR3_IMPRELOAD_TX(0, 2); + FA_COLOR3_IMPRELOAD_TX(0, 3); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int pp = 0; pp < pixelCache; pp += B_X / filtersPerThread) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = + fPreload[c][pp * filtersPerThread / B_X]; + } + } + + __syncthreads(); + FA_COLOR3_IMPRELOAD_TX(1, 0); + FA_COLOR3_IMPRELOAD_TX(1, 1); + FA_COLOR3_IMPRELOAD_TX(1, 2); + FA_COLOR3_IMPRELOAD_TX(1, 3); + FA_COLOR3_IMPRELOAD_TX(2, 0); + FA_COLOR3_IMPRELOAD_TX(2, 1); + FA_COLOR3_IMPRELOAD_TX(2, 2); + FA_COLOR3_IMPRELOAD_TX(2, 3); +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int pp = 0; pp < pixelCache * filtersPerThread / B_X; pp++) { + fPreload[c][pp] = + fPidxNext + pp * (B_X / filtersPerThread) + shFilterLoadY >= + filterPixels + ? 
0 + : tex1Dfetch( + filters, + filtersOffset2 + c * numFilters * filterPixels + + pp * (B_X / filtersPerThread) * numFilters); + } + } +#pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * + shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + } + + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * This won't be pretty. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[numColors][pixelCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even though + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? 
+ const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + const int warp = tidx / 32; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + // images += myImgIdx; + // filters += blockFilterIdx + // + shFilterLoadY * numFilters + shFilterLoadX; + // if (!conv) { // NOTE: UNTESTED! + // filters += moduleIdx * numColors * filterPixels * numFilters; + // } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + + shFilterLoadX + + (conv ? 0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][DIVUP(pixelCache * filtersPerThread, B_X)]; + + if (warp < 3) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int p = 0; p < pixelCache; p += 2) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p / 2] = tex1Dfetch( + filters, + filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else { + fPreload[c][p / 2] = 0; + } + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && + (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch( + images, + imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords( + fPidxNext + ty, + imgLoadModPosY, + imgLoadModPosX, + imgSizeX, + filterSize, + iPidxNext); + +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! 
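+        // The conflict is presumably because consecutive threads (tx) write
+        // words imgsPerThread apart, so with imgsPerThread == 4 lanes that are
+        // 8 apart land on the same shared-memory bank (roughly a 4-way
+        // conflict on this store).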
+ shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + if (warp < 3) { +#pragma unroll + for (int c = 0; c < numColors; ++c) { +#pragma unroll + for (int pp = 0; pp < pixelCache; pp += 2) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp / 2]; + } + } + } + + __syncthreads(); + // const float* ff = &filters[numFilters * fPidxNext]; + // const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + +#pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + FA_COLOR3_IMPRELOAD_TX(c, i); + } + } + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int pp = 0; pp < 2; pp++) { + fPreload[c][pp] = + warp >= 3 || fPidxNext + pp * 2 + shFilterLoadY >= filterPixels + ? 0 + : tex1Dfetch( + filters, + filtersOffset2 + c * numFilters * filterPixels + + pp * 2 * numFilters); + } +#pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * + shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + } + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops costs 2 registers, but saves time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +__device__ inline void +filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + int filterSize, + int imgSizeX, + int imgLoadModPosY, + int imgLoadModPosX, + int imgY, + int imgX, + int& fPidx, + int& iPidx) { + int filterPxY = imgY - imgLoadModPosY; + int filterPxX = imgX - imgLoadModPosX; + fPidx = filterPxY * filterSize + filterPxX; + iPidx = imgY * imgSizeX + imgX; // Pixel index in img +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * Note: in git there's a 1.5% faster version of this which sues 167 registers + * instead of 154... it's basically the same thing, but it doesn't do the + * next-pixel computation. It just avoids pre-loading when it rolls over to the + * next pixel. 
+ */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + __shared__ float + shFilters[colorCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; + // float fCache[filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my + // macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; + float fPreload[colorCache * filtersPerThread / B_X]; + // float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgStartY, + imgStartX, + fPidx, + iPidx); + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || 
myImgIdx + i * B_X < numImages) { + imPreload[i] = images[imgStride * iPidx + i * B_X]; + } else { + imPreload[i] = 0; + } + } + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < + B_X / filtersPerThread) { // This if statement reduces reg usage.. +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = + filters[(c * filterPixels + fPidx) * numFilters]; + } + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + // const int filterPxX = imgX - imgLoadModPosX; + // const int p = filterPxY * filterSize + filterPxX; + // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in + // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, + // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m = + // &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + (imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgYNext, + imgXNext, + fPidxNext, + iPidxNext); + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + const float* ff = + &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; + const float* mm = + &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + if (oc == numFilterColors - colorCache) { + ff = &filters[fPidxNext * numFilters]; + mm = &images[iPidxNext * imgStride]; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX] = + fPreload[c * filtersPerThread / B_X]; + } + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + shImages[ty][tx * imgsPerThread + i] = imPreload[i]; + } + imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) + ? 0 + : mm[0 * B_X]; + imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) + ? 0 + : mm[1 * B_X]; + imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) + ? 0 + : mm[2 * B_X]; + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * + shFilters[0][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[0] = ff[0]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * + shFilters[1][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[1] = ff[(B_X / filtersPerThread * filterPixels) * numFilters]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * + shFilters[2][threadIdx.y * filtersPerThread + f]; + } + } + + imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) + ? 
0 + : mm[3 * B_X]; + +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * + shFilters[3][threadIdx.y * filtersPerThread + f]; + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleTargets * targets[i * B_X + f * numImages * numModules] + + scaleOutputs * prod[i][f]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = + scaleOutputs * prod[i][f]; + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void __launch_bounds__(128, 4) + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex( + cudaTextureObject_t images, + cudaTextureObject_t filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv /*, const bool noloads*/) { + // avoid bank conflict by reorganizing the data structure and improve the band + // width by using 'float2' instead of 'float' + __shared__ float2 + shFilters[colorCache / 2] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float2 + shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int 
blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even thought + // threadIdx.x is in the range 0..31. It appears that this allows the compiler + // to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + // const int tidx = ty * B_X + threadIdx.x; // reduce one register + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // reduce two registers + // const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + // const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx; + const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx; + + // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + + // myImgIdx; + const int filterOffset = blockFilterIdx + + ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels + + ((ty * B_X + tx) % (B_Y * filtersPerThread)) + + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); + // filters +=blockFilterIdx + // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; + // if (!conv) { + // filters += moduleIdx * numFilterColors * filterPixels * numFilters; + // } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * + numModules + + myImgIdx; + + // combine two registers into one + const int numModImages = numModules * numImages; + float prod[imgsPerThread][filtersPerThread]; + // float fCache[filtersPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my + // macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; // [4] + float fPreload[colorCache * filtersPerThread / B_X]; // [2] + // float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgStartY, + imgStartX, + fPidx, + iPidx); + +// remove redundant conditions +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + imPreload[i] = + tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); + } + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = tex1Dfetch( + filters, filterOffset + (c * filterPixels + fPidx) * numFilters); + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + // const int filterPxX = imgX - imgLoadModPosX; + // const int p = filterPxY * filterSize + filterPxX; + // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in + // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, + // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m = + // &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + 
(imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords( + filterSize, + imgSizeX, + imgLoadModPosY, + imgLoadModPosX, + imgYNext, + imgXNext, + fPidxNext, + iPidxNext); + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + // store the preloaded pixel of filter and image into shared memory + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)] + [(ty * B_X + tx) % (B_Y * filtersPerThread)] + .x = fPreload[0]; + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)] + [(ty * B_X + tx) % (B_Y * filtersPerThread)] + .y = fPreload[1]; + shImages[ty][tx].x = imPreload[0]; + shImages[ty][tx].y = imPreload[1]; + shImages[ty][tx + B_X].x = imPreload[2]; + shImages[ty][tx + B_X].y = imPreload[3]; + + int imgOffset2 = + imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); + int filterOffset2 = filterOffset + + numFilters * ((oc + colorCache) * filterPixels + fPidx); + if (oc == numFilterColors - colorCache) { + filterOffset2 = filterOffset + fPidxNext * numFilters; + imgOffset2 = imgOffset + iPidxNext * imgStride; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + + // preload one pixel of filter and image from texture, and no need to + // check 'checkImgBounds' with all callers setting it as false + imPreload[0] = tex1Dfetch(images, imgOffset2); + imPreload[1] = tex1Dfetch(images, imgOffset2 + B_X); + imPreload[2] = tex1Dfetch(images, imgOffset2 + 2 * B_X); + imPreload[3] = tex1Dfetch(images, imgOffset2 + 3 * B_X); + fPreload[0] = tex1Dfetch(filters, filterOffset2); + fPreload[1] = tex1Dfetch( + filters, filterOffset2 + 2 * filterPixels * numFilters); + + __syncthreads(); + +// put together the instructions with same type to improve instruction-level +// parallelism calculate the convolution between images and filters +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int r = 0; r < colorCache / 2; r++) { + prod[0][f] += + shImages[r][tx].x * shFilters[r][ty * filtersPerThread + f].x; + prod[1][f] += + shImages[r][tx].y * shFilters[r][ty * filtersPerThread + f].x; + prod[2][f] += shImages[r][tx + B_X].x * + shFilters[r][ty * filtersPerThread + f].x; + prod[3][f] += shImages[r][tx + B_X].y * + shFilters[r][ty * filtersPerThread + f].x; + prod[0][f] += shImages[r + 2][tx].x * + shFilters[r][ty * filtersPerThread + f].y; + prod[1][f] += shImages[r + 2][tx].y * + shFilters[r][ty * filtersPerThread + f].y; + prod[2][f] += shImages[r + 2][tx + B_X].x * + shFilters[r][ty * filtersPerThread + f].y; + prod[3][f] += shImages[r + 2][tx + B_X].y * + shFilters[r][ty * filtersPerThread + f].y; + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = + scaleTargets * targets[i * B_X + f * numModImages] + + scaleOutputs * prod[i][f]; + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f]; + } + } + } +} + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X + * * imgsPerThread images. 
threadIdx.x determines image threadIdx.y determines + * filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of module and B_Y * filtersPerThread + * + * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numColors, filterPixels, numFilters) if conv + * (numModules, numColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * + * Number of filters per module should be divisible by B_Y * filtersPerThread + * checkImgBounds indicates whether number of images is divisible by B_X * + * imgsPerThread + * + * The imgSize here is the size of the actual image without the padding. + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int numColors, + int pixelCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_color( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + __shared__ float + shFilters[pixelCache * numColors] + [B_Y * filtersPerThread]; // pre-load pixelCache pixels from + // B_Y*filtersPerThread filters + __shared__ float + shImages[pixelCache * numColors] + [B_X * imgsPerThread]; // pre-load pixelCache pixels from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = blockIdx.y % blocksPerModule; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + const int numModules = numModulesY * numModulesX; + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + images += myImgIdx; + filters += filtersPerThread * B_Y * blockFilterIdx + + shFilterLoadY * numFilters + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx * B_Y * filtersPerThread + + threadIdx.y * filtersPerThread) * + numImages * numModulesY * numModulesX + + myImgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + // float* shImgLoad = &shImages[0][threadIdx.x]; + for (int p = 0; p < filterPixels; p += pixelCache) { + /* + * Load pixelCache pixels from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + * filtersPerThread. In this case, not all of the threads will participate + * in the loading operation. This ensures that in each loop iteration, an + * integer number of rows of shFilters are filled, which makes indexing + * simple. 
+ */ + if (B_X % filtersPerThread == 0 || shFilterLoadY < B_X / filtersPerThread) { +#pragma unroll + for (int p2 = 0; p2 < pixelCache; p2 += B_X / filtersPerThread) { + const bool omit = pixelCache % (B_X / filtersPerThread) == 0; + const int preloadPx = shFilterLoadY + p2; + if (omit || preloadPx < pixelCache) { + if (p + preloadPx < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = + filters[(c * filterPixels + p + p2) * numFilters]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = 0; + } + } + } + } + } + +/* + * Load pixelCache pixels from B_X*imgsPerThread images. + */ +#pragma unroll + for (int ly = 0; ly < pixelCache; ly += B_Y) { + const int preloadPx = ly + threadIdx.y; + const int pixIdx = p + preloadPx; + const bool omit = pixelCache % B_Y == 0; // Compile-time condition + /* + * Don't load any image pixels corresponding to filter pixels that don't + * exist. + */ + if (pixIdx < filterPixels && (omit || preloadPx < pixelCache)) { + const int x = imgLoadModPosX + pixIdx % filterSize; + const int y = imgLoadModPosY + pixIdx / filterSize; + + if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) { + float* m = &images[imgStride * (y * imgSizeX + x)]; + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = + m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } else { // Padding +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[preloadPx + c * pixelCache] + [threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } + } + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < pixelCache * numColors; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] += shImages[i][g + threadIdx.x * imgsPerThread] * + shFilters[i][threadIdx.y * filtersPerThread + f]; + } + } + } + __syncthreads(); + } + + if (scale) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * numImages * numModules] = + scaleTargets * targets[g * B_X + f * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X + * * imgsPerThread images. 
threadIdx.x determines image threadIdx.y determines + * filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of B_Y * filtersPerThread + * + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * B_Y one of 4, 8, 16 + * B_X one of 16, 32 + * imgsPerThread one of 1, 2, 4 + * filtersPerThread one of 1, 2, 4, 8 + * colorCache: how many colors to put into shmem + * + * numFilters should be divisible by B_Y * filtersPerThread + * numImages be divisible by B_X * imgsPerThread + * numFilterColors should be divisible by colorCache. + * numImgColors must be even. + * numFilters must be divisible by numGroups. + * no restrictions on pixelCache + * The imgSize here is the size of the actual image without the padding. + * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for + * maximum efficiency. + * + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + __shared__ float + shFilters[colorCache] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y) * numImages * numModules + myImgIdx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + 
} + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + // __shared__ int imgPos[] + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + const int filterPxX = imgX - imgLoadModPosX; + const int p = filterPxY * filterSize + filterPxX; + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) + + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + filtersPerThread. + * In this case, not all of the threads will participate in the loading + operation. + * This ensures that in each loop iteration, an integer number of rows + of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition + under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < + B_X / filtersPerThread) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + if (colorCache % (B_X / filtersPerThread) == 0 || + c + shFilterLoadY < colorCache) { + shFilters[c + shFilterLoadY][shFilterLoadX] = + filters[((oc + c) * filterPixels + p) * numFilters]; + } + } + } + + /* + * Load a pixel from B_X*imgsPerThread images. + */ + const int pixIdx = imgY * imgSizeX + imgX; // Pixel index in img + + float* m = &images[imgStride * (oc * imgPixels + pixIdx)]; +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y) { + if (colorCache % B_Y == 0 || threadIdx.y + c < colorCache) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = + m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = 0; + } + } + } + } + + __syncthreads(); + + for (int c = 0; c < colorCache; c++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][g] += shImages[c][g * B_X + threadIdx.x] * + shFilters[c][threadIdx.y + f * B_Y]; + } + } + } + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * + targets[g * B_X + f * B_Y * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. 
using vectorized data type; * Note: This function can be used + *when each thread loads even number of filter * pixels(filtersPerThread * + *colorCache / B_X is even), and this can be * optimized more when the number + *of loaded image's pixel is even. * + *********************************************************************************/ +template < + int B_Y, + int B_X, + int imgsPerThread, + int filtersPerThread, + int colorCache, + bool scale, + bool checkImgBounds> +__global__ void filterActs_YxX_sparse2_f_vec( + float* images, + float* filters, + float* targets, + const int numImages, + const int numFilters, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int numModulesY, + const int numModulesX, + const int imgStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs, + const bool conv) { + // improve shared memory's band width by using 'float2' instead of 'float' + __shared__ float2 + shFilters[colorCache / 2] + [B_Y * filtersPerThread]; // pre-load 1 pixel from + // B_Y*filtersPerThread filters + __shared__ float shImages[colorCache] + [B_X * imgsPerThread]; // pre-load 1 pixel from + // B_X*imgsPerThread images + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y * filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = + filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = ty * B_X + tx; + + const int imgLoadModPosY = + paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // load position of filters' pixels for current thread + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + // load position of images' pixels for current thread + const int shImgLoadY = tidx / (B_X * imgsPerThread); + const int shImgLoadX = tidx % (B_X * imgsPerThread); + + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX; + images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx; + + filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels + + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + ty) * numImages * numModules + + blockIdx.x * B_X * imgsPerThread + tx; + + float prod[filtersPerThread][imgsPerThread]; +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + + // temporary buffer to store the filter's loaded pixels during each loop + float fPreload[colorCache * filtersPerThread / B_X]; + // temporary buffer to store the 
image's loaded pixels during each loop + float iPreload[colorCache * imgsPerThread / B_Y]; + +// preload filter's pixels +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize + + (imgStartX - imgLoadModPosX)) * + numFilters]; + } + + // preload image's pixels + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [(c * imgPixels + imgStartY * imgSizeX + imgStartX) * imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + // const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + for (int oc = 0; oc < numFilterColors; + oc += colorCache) { // oc stands for outer color (loop) +// store the preloaded filter's pixels into shared memory +#pragma unroll + for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX].x = + fPreload[c * filtersPerThread / B_X]; + shFilters[c + shFilterLoadY][shFilterLoadX].y = + fPreload[(c + colorCache / 2) * filtersPerThread / B_X]; + } + +// store the preloaded image's pixels into shared memory +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + shImages[c + shImgLoadY][shImgLoadX] = + iPreload[c * imgsPerThread / B_Y]; + } + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by + filtersPerThread. + * In this case, not all of the threads will participate in the loading + operation. + * This ensures that in each loop iteration, an integer number of rows + of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition + under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + + /* preload image and filter pixels' data */ + if ((oc + colorCache) == + numFilterColors) { // move to next pixel when all colors of current + // pixel have been finished + int imgXn = (imgX < (imgEndX - 1)) ? 
(imgX + 1) : imgStartX; + int imgYn = imgY + (imgXn != (imgX + 1)); + +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize + + (imgXn - imgLoadModPosX)) * + numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } else { // move next colorCache +#pragma unroll + for (int c = 0; c < colorCache; c += B_X / filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters + [((c + oc + colorCache) * filterPixels + + (imgY - imgLoadModPosY) * filterSize + + (imgX - imgLoadModPosX)) * + numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images + [((c + oc + colorCache) * imgPixels + imgY * imgSizeX + + imgX) * + imgStride]; + } + } else { +#pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } + + __syncthreads(); + + // convolution + for (int c = 0; c < colorCache / 2; c++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[f][g] += + shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x; + prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] * + shFilters[c][ty + f * B_Y].y; + } + } + } + __syncthreads(); + } + } + } + + // write convolution result into global memory + if (scale) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * + targets[g * B_X + f * B_Y * numImages * numModules] + + scaleOutputs * prod[f][g]; + } + } + } + } else { +// Note: reversing order of these loops saves 2 registers, but costs time +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = + scaleOutputs * prod[f][g]; + } + } + } + } +} +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) + * otherwise + * + * targets: (numFilters, numModules, numImages) + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but but I made no attempt whatsoever + * to make them work fast. 
+ */ +void _filterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput, + bool conv) { + CAFFE_ENFORCE(images->ndim() == 2); + CAFFE_ENFORCE(filters->ndim() == 2); + CAFFE_ENFORCE(targets->ndim() == 2); + + int numFilterColors = numImgColors / numGroups; + int numFilters = filters->dim32(1); + int numModules = numModulesY * numModulesX; + int numImages = images->dim32(1); + int imgPixels = images->dim32(0) / numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int filterModuleMult = conv ? 1 : numModules; + + CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0); + CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0); + CAFFE_ENFORCE(numImgColors % numGroups == 0); + CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors); + CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels); + int numFiltersPerGroup = numFilters / numGroups; + + int imgStride = images->dim32(1); + + int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors); + int filterSize = int(sqrt(filterPixels)); + CAFFE_ENFORCE(filterSize * filterSize == filterPixels); + CAFFE_ENFORCE( + filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + int filtersPerThread, threadsY = 4; + if (numImgColors <= 3) { + // Special kernels written for colors = 3, filters = 64 and colors = 3, + // filters = 48 cases. The remaining cases use the old routines. + // TODO: Modernize the remaining cases if you care about them. + filtersPerThread = numFiltersPerGroup % 64 == 0 + ? 16 + : numFiltersPerGroup % 48 == 0 ? 12 + : numFiltersPerGroup % 32 == 0 ? 8 : 4; + } else { + filtersPerThread = numFiltersPerGroup % 64 == 0 + ? 16 + : numFiltersPerGroup % 32 == 0 ? 8 : 4; + threadsY = numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 && + imgsPerThread != 4 + ? 8 + : 4; + } + int threadsX = 32; + dim3 threads(threadsX, threadsY); + dim3 blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread), + (numModules * numFilters) / (threads.y * filtersPerThread)); + + bool checkImgBounds = numImages % (threads.x * imgsPerThread) != 0; + bool scale = scaleTargets != 0; + if (scaleTargets == 0) { + targets->Resize(std::vector<int>{numFilters * numModules, numImages}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == numFilters * numModules); + CAFFE_ENFORCE(targets->dim32(1) == numImages); + } + + float* images_data = images->mutable_data<float>(); + float* filters_data = filters->mutable_data<float>(); + float* targets_data = targets->mutable_data<float>(); + const std::size_t images_bytes = images->nbytes(); + + cudaStream_t stream = context->cuda_stream(); + + checkCudaErrors(cudaDeviceSetSharedMemConfig( + cudaSharedMemBankSizeEightByte)); // use the wider bandwidth + + // Auto-generated calling code...
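The dispatch ladder that follows is easier to read once the launch geometry above is made concrete. The sketch below is a minimal host-only illustration, not a definitive part of this kernel file: it assumes DIVUP is the usual ceiling-division helper, picks arbitrary example sizes, and simplifies the filtersPerThread heuristic to the many-color path, then reproduces the grid/block arithmetic used when launching the kernels on the Caffe2 context's stream.

#include <cstdio>

// Assumed helper: ceiling division, as used by the dispatcher.
#define DIVUP(a, b) (((a) + (b) - 1) / (b))

int main() {
  // Illustrative sizes: a 256-image minibatch, 64 filters in one group,
  // and a 24x24 grid of output modules.
  const int numImages = 256, numFilters = 64, numGroups = 1;
  const int numModulesY = 24, numModulesX = 24;
  const int numModules = numModulesY * numModulesX;
  const int numFiltersPerGroup = numFilters / numGroups;

  // Same packing heuristics as the host code above (many-color path):
  // more images per thread for batches divisible by 128/64, more filters
  // per thread when the per-group filter count allows it.
  const int imgsPerThread =
      numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
  const int filtersPerThread =
      numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 32 == 0 ? 8 : 4;
  const int threadsX = 32;
  const int threadsY = 4; // the dispatcher bumps this to 8 for some configs

  // grid.x tiles the minibatch; grid.y tiles (modules x filters).
  const int gridX = DIVUP(numImages, threadsX * imgsPerThread);
  const int gridY = (numModules * numFilters) / (threadsY * filtersPerThread);

  std::printf("block = (%d, %d), grid = (%d, %d)\n",
              threadsX, threadsY, gridX, gridY);

  // Each kernel writes output element (f, m, i) to
  // targets[(f * numModules + m) * numImages + i], matching the
  // (numFilters, numModules, numImages) layout documented above.
  return 0;
}

With these example sizes the sketch prints block = (32, 4), grid = (2, 576): two blocks cover the 256 images at 32 * 4 images per block, and 576 blocks cover the 576 modules times 64 filters at 4 * 16 filters per block, which is exactly the shape the generated launches below expect.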
+ // NOTE: The calling code is set up such that if checkImgBounds is true, then + // imgsPerThread = 1. In principle it doesn't have to be this way, and you may + // want to optimize for that case. + + if (scale == false) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 64 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + false, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + 
numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>, + 
cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + 
filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + false, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + false, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + false, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + 
imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false>, + 
cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if 
(numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 
1, 16, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 
4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } + } else if (scale == true) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 64 == 0) { + if (images_bytes < TEXTURE_SIZE_MAX) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + 
filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false>, + cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4< + 4, + 32, + 4, + 16, + 4, + true, + false><<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + 
numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 
== 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, 
true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + true, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex< + 4, + 32, + 4, + 16, + 3, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + true, + false>, + cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex< + 4, + 32, + 4, + 12, + 3, + 4, + true, + false><<>>( + tex_images, + tex_filters, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 
4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 
32, 1, 4, 3, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + 
scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, 
+ numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 
1, 8, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + 
filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig( + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true>, + cudaFuncCachePreferShared); + filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true> + <<>>( + images_data, + filters_data, + targets_data, + numImages, + numFilters, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + numModulesY, + numModulesX, + imgStride, + scaleTargets, + scaleOutput, + conv); + } + } + } + } + } + + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + getLastCudaError("filterActs: kernel execution failed"); +} + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) 
{ + convFilterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1); +} + +void convFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _filterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + true); +} + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + localFilterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1); +} + +void localFilterActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _filterActs( + context, + images, + filters, + targets, + imgSizeY, + numModulesY, + numModulesX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + false); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu new file mode 100644 index 0000000..e8dd351 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu @@ -0,0 +1,9796 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../include/cudaconv2.cuh" + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread. + * blockIdx.y determines 4x4 image region in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numColors, filterPixels, numFilters) if conv (numModulesY, + * numModulesX, numColors, filterPixels, numFilters) otherwise targets: + * (numColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * Number of filters must be divisible by 16. + * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is + * false. 16 * imgsPerThread must be divisible by 32. + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads 16 weights at a time, so those aren't fully coalesced. 
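+ * (Worked example of the mapping above, derived from the index math in the
+ * kernel body: blockIdx.y enumerates the 4x4 pixel regions, presumably
+ * DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4) of them, threadIdx.y picks one of
+ * the 16 pixels inside a region, and threadIdx.x picks a case, so with
+ * imgsPerThread = 2 one block reconstructs a 4x4 region for 32 cases.)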
+ * This version conserves shared memory by loading 16 filters at a time rather + * than 32. + */ +template < + int imgsPerThread, + int numColors, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void img_acts_color( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[numColors * 16][16 + 1]; + __shared__ float shHidActs[16][16 * imgsPerThread]; + + const int blockCaseIdx = blockIdx.x * 16 * imgsPerThread; + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const int numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeX * imgSizeY; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += blockCaseIdx + loadY * numImages * numModules + loadX; + filters += threadIdx.x; + targets += pxIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[numColors][imgsPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize + ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize + ? 0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && + pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFilters; + f += 16) { // multiply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides + // if it's interested. 
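+ // Each iteration of this loop stages 16 rows of hidActs for the current
+ // module in shHidActs; threads whose pixel lies inside the module also stage
+ // the 16 matching weights per color in shFilters and then accumulate the
+ // products into prod[][].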
+ const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights + // from this module to its pixel. Not fully coalesced read :( But + // taking out this read entirely only reduces the runtime by ~2.8%, so + // it isn't costing me much. + const float* fLoad = conv + ? &filters[pxIdxInModule * numFilters + f] + : &filters + [(moduleIdx * numColors * filterPixels + pxIdxInModule) * + numFilters + + f]; +#pragma unroll + for (int c = 0; c < numColors; c++) { + shilterLoad[c * 16 * (16 + 1)] = + fLoad[c * filterPixels * numFilters]; + } + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int w = 0; w < 16; w++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * + shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... shmem (and fully coalesced) version is + // actually slightly slower, though + if (isPxInImg) { + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleTargets * targets[c * imgPixels * numImages + i * 16] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread, also color in + * batches of colorsPerThread. In essence, blockIdx.x.x + * = 1..numImages/(16*imgsPerThread) blockIdx.x.y + * = 1..numImgColors/colorsPerThread blockIdx.y determines 4x4 image region in + * target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX, + * numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * numImageColors/numGroups must be divisible by colorsPerThread. + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads 16 weights at a time, so those aren't fully coalesced. 
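+ * (Here blockIdx.x packs two indices: blockIdx.x % numImgBlocks selects the
+ * block of 16*imgsPerThread cases and blockIdx.x / numImgBlocks selects the
+ * group of colorsPerThread colors, so the host presumably launches
+ * DIVUP(numImages, 16*imgsPerThread) * numImgColors/colorsPerThread blocks
+ * along x.)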
+ * This version conserves shared memory by loading 16 filters at a time rather + * than 32. + * + * To be used when there are 4-16 color channels. + */ +template < + int imgsPerThread, + int colorsPerThread, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void img_acts_mediumcolor( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * 16][16 + 1]; + __shared__ float shHidActs[16][16 * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, 16 * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16 * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const uint numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += + blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX; + filters += + blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x; + targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize + ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize + ? 
0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && + pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFiltersPerGroup; + f += 16) { // multipply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides + // if it's interested. + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < 16; + j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 + // elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights + // from this module to its pixel. + + // Not fully coalesced read :( + // But taking out this read entirely only reduces the runtime by + // ~2.8%, so it isn't costing me much. + const float* fLoad = conv + ? &filters[pxIdxInModule * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInModule * numFilters + f]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + shFilterLoad[c * 16 * (16 + 1)] = + fLoad[c * filterPixels * numFilters]; + } + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int w = 0; w < 16; w++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * + shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... shmem (and fully coalesced) version is + // actually slightly slower, though + if (isPxInImg) { + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleTargets * targets[c * imgPixels * numImages + i * 16] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * 16 < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = + scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +/* + * Block size: B_YxB_X. 
+ * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in + batches of B_Y*colorsPerThread. + * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) + * blockIdx.y determines image pixel in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from + B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCache. + * + * B_X * imgsPerThread must be divisible by 32. + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by 32. + * filterCache must be divisible by B_X*B_Y/32 + * B_X*B_Y must be divisible by filterCache + + * This version loads 32 cases at a time, so it gets full coalescing on that + load. + * It only loads filterCache weights at a time, so those aren't fully coalesced + (depending on size of filterCache). + * + * To be used when there are >= 16 color channels. + */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCache, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void conv_img_acts_manycolor( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCache + 1]; + __shared__ float shHidActs[filterCache][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32; + const int filtersLoadY = tidx / filterCache, + filtersLoadX = tidx % filterCache; + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + + float 
prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; + f += filterCache) { // multiply with filterCache filters at a time + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; +#pragma unroll + for (int i = 0; i < imgsPerThread * B_X; i += 32) { + if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) { +#pragma unroll + for (int j = 0; j < filterCache; j += + B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X + // cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } + } else { +#pragma unroll + for (int j = 0; j < filterCache; j += + B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X + // cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + const float* fLoad = conv + ? 
&filters[pxIdxInFilter * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f]; +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCache) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCache) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * (filterCache + 1)] = + fLoad[i * filterPixels * numFilters]; + } + } + + __syncthreads(); +// Do some actual computation +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int w = 0; w < filterCache; w++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * + shHidActs[w][threadIdx.x + i * B_X]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +/* + * Block size: B_YxB_X. + * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in + * batches of B_Y*colorsPerThread. In essence, blockIdx.x.x + * = 1..numImages/(B_X*imgsPerThread) blockIdx.x.y + * = 1..numImgColors/(B_Y*colorsPerThread) blockIdx.y determines image pixel in + * target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, + * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX, + * numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from + * B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCacheF. + * + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by filterCacheF + * filterCacheF must be divisible by filterCacheH + * + * This version loads 32 cases at a time, so it gets full coalescing on that + * load. It only loads filterCacheF weights at a time, so those aren't fully + * coalesced (depending on size of filterCacheF). + * + * To be used when there are >= 16 color channels. 
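+ * (In the kernel body the outer loop stages filterCacheF filter columns of
+ * weights at a time, while the inner loop stages hidActs in chunks of
+ * filterCacheH rows, which is why filterCacheF must be divisible by
+ * filterCacheH.)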
+ */ +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCacheF, + int filterCacheH, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void conv_img_acts_manycolor_kepler( + const float* hidActs, + const float* filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = threadIdx.y, hidActLoadX = threadIdx.x; + // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % + // (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, + filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save + // registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 
0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + // const bool noFLoop = filterCacheF == filterCacheH; + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time + const float* fLoad = conv + ? &filters[pxIdxInFilter * numFilters + f] + : &filters + [moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f]; +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + fLoad[i * filterPixels * numFilters]; + } + } + //#pragma unroll + + for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { + // conv_img_acts_manycolor_dummy_fhLoop(hidActs, shHidActLoad, shHidActs, shFilters, + // moduleIdx, numImages, hidActLoadY, hidActLoadX, blockCaseIdx, + // numModules, f, fh, prod); + + const float* hLoad = + &hidActs[(moduleIdx + fh * numModules) * numImages]; + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || hidActLoadY + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread * B_X; i += B_X) { + if (!checkCaseBounds || + blockCaseIdx + hidActLoadX + i < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hLoad[j * numModules * numImages + i]; + } else { + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + } + + __syncthreads(); + +// Do some actual computation +// Using these variables causes register usage to go from 161 --> 123. +// But nonetheless, the high-register version is faster. 
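+// The unrolled loops below form the inner product: each thread multiplies the
+// filterCacheH cached hidAct rows by its colorsPerThread filter rows and
+// accumulates imgsPerThread partial sums per color in registers (prod[][]).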
+// const float* shF = &shFilters[threadIdx.y][fh-f]; +// const float* const shF2 = &shFilters[threadIdx.y][fh]; +// const float* shH = &shHidActs[0][threadIdx.x]; +#pragma unroll + for (int w = 0; w < filterCacheH; w++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[c * B_Y + threadIdx.y][fh - f + w] * + shHidActs[w][threadIdx.x + i * B_X]; + } + } + } + __syncthreads(); + } + } + } + } + if (scale) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || + blockCaseIdx + threadIdx.x + i * B_X < numImages) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +/* + * New Titan-optimized stuff. + */ + +__device__ __forceinline__ void +conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + const int my, + const int mx, + const int numModulesX, + const int paddingStart, + const int moduleStride, + const int blockPixelIdxY, + const int blockPixelIdxX, + const int filterSize, + int& moduleIdx, + int& pxIdxInFilter) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + moduleIdx = my * numModulesX + mx; // out + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; // out +} + +#define IA_PRELOAD_LOOP(w, offset) \ + _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) { \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] * \ + shHidActs[w][threadIdx.x * imgsPerThread + i]; \ + } \ + } + +/* + * Same loop as above but inverted. 
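+ * IA_PRELOAD_LOOP keeps images in the outer loop and colors inside;
+ * IA_PRELOAD_LOOP2 swaps that nesting (colors outer, images inner). The
+ * values accumulated into prod[c][i] are identical; only the instruction
+ * ordering differs.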
+ */
+#define IA_PRELOAD_LOOP2(w, offset)                                     \
+  _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) {         \
+    _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) {         \
+      prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] *  \
+          shHidActs[w][threadIdx.x * imgsPerThread + i];                \
+    }                                                                   \
+  }
+
+#define IA_PRELOAD_LOOP3(i, offset)                                     \
+  _Pragma("unroll") for (int w = 0; w < filterCacheH; w++) {            \
+    _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) {       \
+      prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] *  \
+          shHidActs[w][threadIdx.x * imgsPerThread + i];                \
+    }                                                                   \
+  }
+
+#define IA_PRELOAD_W(z) \
+  wPreload[z] = fLoad[(z)*B_X * B_Y / filterCacheF * filterPixels * numFilters];
+#define IA_PRELOAD_W_TX(z)                                               \
+  wPreload[z] = tex1Dfetch<float>(                                       \
+      filters,                                                           \
+      filtersLoadOffset +                                                \
+          (z)*B_X * B_Y / filterCacheF * filterPixels * numFilters);
+#define IA_PRELOAD_H(y, x)                                               \
+  if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) {             \
+    hPreload[y][x] = hLoad[(y)*B_Y * numModules * numImages + (x)*B_X];  \
+  }
+#define IA_PRELOAD_H_TX(y, x)                                            \
+  if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) {             \
+    hPreload[y][x] = tex1Dfetch<float>(                                  \
+        hidActs,                                                         \
+        hidActsLoadOffset + (y)*B_Y * numModules * numImages + (x)*B_X); \
+  }
+
+template <
+    int B_Y,
+    int B_X,
+    int imgsPerThread,
+    int colorsPerThread,
+    int filterCacheF,
+    int filterCacheH,
+    bool scale,
+    bool checkCaseBounds,
+    bool conv>
+__global__ void __launch_bounds__(
+    256,
+    2) // 256 threads per block, 2 blocks per multiprocessor
+       // These launch bounds ensure 25% occupancy (128 registers used)
+       // as opposed to 13% (130 registers) achieved by the defaults.
+    conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(
+        cudaTextureObject_t hidActs,
+        cudaTextureObject_t filters,
+        float* targets,
+        const int numModulesY,
+        const int numModulesX,
+        const int numImages,
+        const int numFilters,
+        const int filterSize,
+        const int imgSizeY,
+        const int imgSizeX,
+        const int paddingStart,
+        const int moduleStride,
+        const int numImgColors,
+        const int numGroups,
+        const float scaleTargets,
+        const float scaleOutputs) {
+  __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF];
+  __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread];
+
+  const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
+  const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
+  const int myCaseIdx = blockCaseIdx + threadIdx.x;
+
+  const int imgColorIdx =
+      (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
+  const int numFilterColors = numImgColors / numGroups;
+  const int blockGroupIdx = imgColorIdx / numFilterColors;
+  const int filterColorIdx =
+      imgColorIdx % numFilterColors; // color idx within group
+  const int numFiltersPerGroup = numFilters / numGroups;
+  const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
+
+  const int blockPixelIdx = blockIdx.y;
+  const int blockPixelIdxX = blockPixelIdx % imgSizeX;
+  const int blockPixelIdxY = blockPixelIdx / imgSizeX;
+
+  const int filterPixels = filterSize * filterSize;
+  const int imgPixels = imgSizeY * imgSizeX;
+  const int tidx = threadIdx.y * B_X + threadIdx.x;
+  // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x %
+  // B_X;
+  // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx %
+  // (B_X*imgsPerThread);
+  const int filtersLoadY = tidx / filterCacheF,
+            filtersLoadX = tidx % filterCacheF;
+  // nvcc is behaving idiotically again, these useless declarations save
+  //
registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + const int hidActsOffset = + (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + + // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * + // filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH / B_Y][imgsPerThread]; // [2][4] + float wPreload[filterCacheF * colorsPerThread / B_X]; // [8] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + startY, + startX, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdx, + pxIdxInFilter); + // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] + // : &filters[moduleIdx * numFilterColors * + // filterPixels * numFilters + pxIdxInFilter * + // numFilters + 0]; + int filtersLoadOffset = filtersOffset + + (conv ? 
pxIdxInFilter * numFilters + 0 + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters); +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch( + filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + + // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages]; + int hidActsLoadOffset = + hidActsOffset + (moduleIdx + 0 * numModules) * numImages; +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j / B_Y][i] = tex1Dfetch( + hidActs, + hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + myNext, + mxNext, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdxNext, + pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + wPreload[i * filterCacheF / (B_X * B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * + numFilters + + pxIdxInFilterNext * numFilters); + } + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! 
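+              // (shHidActLoad points at shHidActs[threadIdx.y][threadIdx.x *
+              // imgsPerThread], so adjacent threads write shared-memory words
+              // imgsPerThread apart and several lanes of a warp land in the
+              // same bank.)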
+ if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheH) * numModules) * numImages; + +#pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z, 0); + IA_PRELOAD_W_TX(z); + } + +#pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z, 0); + IA_PRELOAD_H_TX((z - 4) / 4, z % 4); + } + +#pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z, 0); + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + +#pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + IA_PRELOAD_W_TX(z + 4); + } + +#pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + IA_PRELOAD_H_TX((z - 4) / 4, z % 4); + } + +#pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z, filterCacheH); + } + + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * + targets[c * B_Y * imgPixels * numImages + i * B_X] + + scaleOutputs * prod[c][i]; + } + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = + scaleOutputs * prod[c][i]; + } + } + } + } +} + +template < + int B_Y, + int B_X, + int imgsPerThread, + int colorsPerThread, + int filterCacheF, + int filterCacheH, + bool scale, + bool checkCaseBounds, + bool conv> +__global__ void +//__launch_bounds__(128, 3) // 128 threads per block, 3 blocks per +// multiprocessor +conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16( + cudaTextureObject_t hidActs, + cudaTextureObject_t filters, + float* targets, + const int numModulesY, + const int numModulesX, + const int numImages, + const int numFilters, + const int filterSize, + const int imgSizeY, + const int imgSizeX, + const int paddingStart, + const int moduleStride, + const int numImgColors, + const int numGroups, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; + const int myCaseIdx = blockCaseIdx + threadIdx.x; + + const int imgColorIdx = + (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = + imgColorIdx % 
numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % + // B_X; + // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % + // (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, + filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save + // registers + // const int outputY = threadIdx.y, outputX = threadIdx.x; + // const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + const int hidActsOffset = + (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + + filtersLoadX; + + // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + + // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * + // filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = + min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize + ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = + min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH / B_Y][imgsPerThread]; // [4][4] + float wPreload[filterCacheF * colorsPerThread / B_X]; // [6] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + startY, + startX, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdx, + pxIdxInFilter); + // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] + // : &filters[moduleIdx * numFilterColors * + // filterPixels * numFilters + pxIdxInFilter * + // numFilters + 0]; + int filtersLoadOffset = filtersOffset + + (conv ? 
pxIdxInFilter * numFilters + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters); +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch( + filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + + // const float* hLoad = &hidActs[moduleIdx * numImages]; + int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j / B_Y][i] = tex1Dfetch( + hidActs, + hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords( + myNext, + mxNext, + numModulesX, + paddingStart, + moduleStride, + blockPixelIdxY, + blockPixelIdxX, + filterSize, + moduleIdxNext, + pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; + f += filterCacheF) { // multiply with filterCacheF filters at a time +#pragma unroll + for (int i = 0; i < colorsPerThread * B_Y; + i += B_X * B_Y / filterCacheF) { + if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread * B_Y) { + shFilterLoad[i * filterCacheF] = + wPreload[i * filterCacheF / (B_X * B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + + (conv ? pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * + numFilters + + pxIdxInFilterNext * numFilters); + } + +#pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { +#pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = + hPreload[j / B_Y][i]; + } + } + } + } + hidActsLoadOffset = hidActsOffset + + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + + __syncthreads(); + + // It seems that there is no point explicitly interleaving loads + // and computations because the scheduler does that anyway. 
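+      // The 16 IA_PRELOAD_LOOP2 steps below consume the filterCacheH (= 16)
+      // cached activations for this tile; the 6 IA_PRELOAD_W_TX and 16
+      // IA_PRELOAD_H_TX fetches then refill wPreload (6 floats) and hPreload
+      // (4x4 floats) for the next iteration.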
+
+        IA_PRELOAD_LOOP2(0, 0);
+        IA_PRELOAD_LOOP2(1, 0);
+        IA_PRELOAD_LOOP2(2, 0);
+        IA_PRELOAD_LOOP2(3, 0);
+        IA_PRELOAD_LOOP2(4, 0);
+        IA_PRELOAD_LOOP2(5, 0);
+        IA_PRELOAD_LOOP2(6, 0);
+        IA_PRELOAD_LOOP2(7, 0);
+        IA_PRELOAD_LOOP2(8, 0);
+        IA_PRELOAD_LOOP2(9, 0);
+        IA_PRELOAD_LOOP2(10, 0);
+        IA_PRELOAD_LOOP2(11, 0);
+        IA_PRELOAD_LOOP2(12, 0);
+        IA_PRELOAD_LOOP2(13, 0);
+        IA_PRELOAD_LOOP2(14, 0);
+        IA_PRELOAD_LOOP2(15, 0);
+
+        IA_PRELOAD_W_TX(0);
+        IA_PRELOAD_W_TX(1);
+        IA_PRELOAD_W_TX(2);
+        IA_PRELOAD_W_TX(3);
+        IA_PRELOAD_W_TX(4);
+        IA_PRELOAD_W_TX(5);
+
+        IA_PRELOAD_H_TX(0, 0);
+        IA_PRELOAD_H_TX(0, 1);
+        IA_PRELOAD_H_TX(0, 2);
+        IA_PRELOAD_H_TX(0, 3);
+        IA_PRELOAD_H_TX(1, 0);
+        IA_PRELOAD_H_TX(1, 1);
+        IA_PRELOAD_H_TX(1, 2);
+        IA_PRELOAD_H_TX(1, 3);
+        IA_PRELOAD_H_TX(2, 0);
+        IA_PRELOAD_H_TX(2, 1);
+        IA_PRELOAD_H_TX(2, 2);
+        IA_PRELOAD_H_TX(2, 3);
+        IA_PRELOAD_H_TX(3, 0);
+        IA_PRELOAD_H_TX(3, 1);
+        IA_PRELOAD_H_TX(3, 2);
+        IA_PRELOAD_H_TX(3, 3);
+
+        __syncthreads();
+      }
+    }
+  }
+  if (scale) {
+#pragma unroll
+    for (int c = 0; c < colorsPerThread; c++) {
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
+          targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
+                  targets[c * B_Y * imgPixels * numImages + i * B_X] +
+              scaleOutputs * prod[c][i];
+        }
+      }
+    }
+  } else {
+#pragma unroll
+    for (int c = 0; c < colorsPerThread; c++) {
+#pragma unroll
+      for (int i = 0; i < imgsPerThread; i++) {
+        if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
+          targets[c * B_Y * imgPixels * numImages + i * B_X] =
+              scaleOutputs * prod[c][i];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * hidActs: (numFilters, numModules, numImages)
+ * filters: (numFilterColors, filterPixels, numFilters)             if conv,
+ *          (numModules, numFilterColors, filterPixels, numFilters) otherwise
+ * targets: (overSample, numImgColors, imgPixels, numImages)
+ *
+ * Note: all of these convolution routines are optimized for the case when
+ * the number of images (i.e. the minibatch size) is a multiple of 128.
+ * Other batch sizes will work, but I made no attempt whatsoever
+ * to make them work fast.
+ */
+void _imgActs(
+    caffe2::CUDAContext* context,
+    caffe2::TensorCUDA* hidActs,
+    caffe2::TensorCUDA* filters,
+    caffe2::TensorCUDA* targets,
+    int imgSizeY,
+    int imgSizeX,
+    int numModulesY,
+    int paddingStart,
+    int moduleStride,
+    int numImgColors,
+    int numGroups,
+    float scaleTargets,
+    float scaleOutput,
+    bool conv) {
+  CAFFE_ENFORCE(hidActs->ndim() == 2);
+  CAFFE_ENFORCE(filters->ndim() == 2);
+  CAFFE_ENFORCE(targets->ndim() == 2);
+
+  int numFilterColors = numImgColors / numGroups;
+  int numImages = hidActs->dim32(1);
+  int numFilters = filters->dim32(1);
+  int numModules = hidActs->dim32(0) / numFilters;
+  int filterModuleMult = conv ? 1 : numModules;
+  int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors);
+  int filterSize = sqrt(filterPixels);
+  int imgPixels = imgSizeY * imgSizeX;
+  int numModulesX = numModules / numModulesY;
+
+  CAFFE_ENFORCE(numImgColors % numGroups == 0);
+  CAFFE_ENFORCE(
+      numFilters % (16 * numGroups) ==
+      0); // TODO: insisting on 32 filters due to bug in calling code below. fix
+          // that.
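+  // Example (assuming conv == true and numGroups == 1): for 3-channel images,
+  // 5x5 filters and a 28x28 module grid, filters is (3 * 25, numFilters) and
+  // hidActs is (numFilters * 784, numImages), so the divisions above recover
+  // filterPixels = 25, filterSize = 5 and numModules = 784.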
+ CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0); + + CAFFE_ENFORCE(filterPixels == filterSize * filterSize); + CAFFE_ENFORCE(hidActs->dim32(0) == numModules * numFilters); + CAFFE_ENFORCE( + filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels); + CAFFE_ENFORCE(numModules == numModulesY * numModulesX); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + dim3 blocks; + dim3 threads; + int colorsPerThread, imgsPerThread; + if (numFilterColors % 8 == 0) { + threads = dim3(32, numFilterColors % 64 == 0 ? 8 : 4); + colorsPerThread = numFilterColors % 64 == 0 ? 8 + : numFilterColors % 48 == 0 + ? 12 + : numFilterColors % 32 == 0 ? 8 : numFilterColors % 16 == 0 ? 4 : 2; + imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + CAFFE_ENFORCE(numFilterColors % (threads.y * colorsPerThread) == 0); + + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread) * + (numImgColors / (threads.y * colorsPerThread)), + imgPixels); + // NOTE: the case when channels % 32 == 0 but channels % 48 != 0 and + // channels % 64 != 0 has not been optimized!! + } else if (numFilterColors > 3) { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2; + threads = dim3(16, 16); + colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2; + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread) * + (numImgColors / colorsPerThread), + DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4)); + } else { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
4 : 2; + threads = dim3(16, 16); + blocks = dim3( + DIVUP(numImages, threads.x * imgsPerThread), + DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4)); + } + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + + if (scaleTargets == 0) { // do not scale or use targets matrix + targets->Resize(std::vector{numImgColors * imgPixels, numImages}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == numImgColors * imgPixels); + CAFFE_ENFORCE(targets->dim32(1) == numImages); + } + const bool scale = scaleTargets != 0; + + float* hidacts_data = hidActs->mutable_data(); + float* filters_data = filters->mutable_data(); + float* targets_data = targets->mutable_data(); + + cudaStream_t stream = context->cuda_stream(); + // cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + // 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); + // conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, + // 12, 16, 16, false, false, true ><<>>( + // tex_hidacts, tex_filters, targets_data, numModulesY, + // numModulesX, numImages, numFilters, filterSize, imgSizeY, + // imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, + // scaleTargets, scaleOutput); + + // return; + // printf("conv: %d\n", conv); + // printf("scale: %d\n", scale); + // printf("checkCaseBounds: %d\n", checkCaseBounds); + // printf("numFilterColors: %d\n", numFilterColors); + // printf("numImages: %d\n", numImages); + // cudaStream_t stream = NVMatrix::getDefaultStream(); + + if (conv == true) { + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 
scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + 
checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + true><<>>( + 
hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + 
conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + 
imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + 
scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, 
+ numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + 
conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, true, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) 
{ + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + 
conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + true><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, 
+ 12, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, 
+ 32, + 4, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + 
filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + 
cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 
32, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + true>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + true><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, true, true>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, true, true>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, true, true> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } + } else if (conv == false) { + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + false, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + 
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + false, + 
false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + false, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 
4, + 32, + 2, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if 
(numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + 
img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + false>, + 
cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + false, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, false, true, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + 
numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, false, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, false, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< + 8, + 32, + 4, + 8, + 32, + 16, + true, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 4, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 2, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaTextureObject_t tex_hidacts = + GetTensorTextureObject(hidActs); + cudaTextureObject_t tex_filters = + GetTensorTextureObject(filters); + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< + 4, + 32, + 4, + 12, + 16, + 16, + true, + false, + false><<>>( + tex_hidacts, + tex_filters, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + 
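+            // Texture-object path: tex_hidacts and tex_filters were created above via
+            // GetTensorTextureObject for the preloadfh kernel variant; they are
+            // destroyed below once the launch has been issued.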
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + checkCudaErrors(cudaDestroyTextureObject(tex_filters)); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, 
+ 32, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + 
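+            // Only a multiple of 32 images remains here: dispatch the narrower
+            // per-thread image tiling (third template argument 1, versus 4 and 2
+            // in the 128- and 64-image branches above).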
cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 4, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 2, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + false, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<8, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<8, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + 
numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<4, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<4, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, false, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + 
moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<8, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<8, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<4, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<4, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, 
+ targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, false, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, false, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 32, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 8, + 32, + 1, + 8, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 48 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 12, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 32, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } else if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + false>, + 
cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 8, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 16 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 4, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors % 8 == 0) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + false>, + cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler< + 4, + 32, + 1, + 2, + 16, + 16, + true, + true, + false><<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_mediumcolor<2, 4, true, true, false>, + cudaFuncCachePreferShared); + img_acts_mediumcolor<2, 4, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 3, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 3, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 2) { + if (numFilters % 16 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 2, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 2, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors == 1) { + if (numFilters % 16 == 
0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig( + img_acts_color<2, 1, true, true, false>, + cudaFuncCachePreferShared); + img_acts_color<2, 1, true, true, false> + <<>>( + hidacts_data, + filters_data, + targets_data, + numModulesY, + numModulesX, + numImages, + numFilters, + filterSize, + imgSizeY, + imgSizeX, + paddingStart, + moduleStride, + scaleTargets, + scaleOutput); + } + } + } + } + } + } + } + + getLastCudaError("imgActs: kernel execution failed"); +} + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1, + true); +} + +void convImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + true); +} + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 0, + 1, + false); +} + +void localImgActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* filters, + caffe2::TensorCUDA* targets, + int imgSizeY, + int imgSizeX, + int numModulesY, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _imgActs( + context, + hidActs, + filters, + targets, + imgSizeY, + imgSizeX, + numModulesY, + paddingStart, + moduleStride, + numImgColors, + numGroups, + scaleTargets, + scaleOutput, + false); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu b/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu new file mode 100644 index 0000000..b41a617 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu @@ -0,0 +1,6099 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../include/cudaconv2.cuh" + +#define LO16(x) ((x)&0x0000FFFF) +#define HI16(x) ((x) >> 16) + +#define WA_LOOP(r) \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \ + shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ + } + +#define WA_LOOP2(r) \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \ + shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ + } + +#define WA_IMLOAD(r) \ + imPreload[r] = im[(r)*B_X * B_Y / preloadCases * imgPixels * imgStride]; +#define WA_IMLOAD_TX(r) \ + imPreload[r] = tex1Dfetch( \ + images, \ + imgOffset2 + (r)*B_X * B_Y / preloadCases * imgPixels * imgStride); +#define WA_HALOAD(r) \ + haPreload[r] = ha[(r)*B_X * B_Y / preloadCases * numImages * numModules]; +#define WA_HALOAD_TX(r) \ + haPreload[r] = tex1Dfetch( \ + hidActs, \ + hidActsOffset2 + (r)*B_X * B_Y / preloadCases * numImages * numModules); + +__device__ __forceinline__ void +conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + const int my, + const int mx, + const int paddingStart, + const int numModulesX, + const int moduleStride, + const int blockPixelY, + const int blockPixelX, + const int imgSizeX, + const int imgStride, + int& pixIdx, + int& m) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + m = my * numModulesX + mx; +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. 
+ */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__global__ void conv_weight_acts_c_kepler( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int partialSum, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int filterBlocksPerModule = numFilters / (B_X * filtersPerThread); + const int outputModuleIdx = blockIdx.x / filterBlocksPerModule; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % filterBlocksPerModule); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += (outputModuleIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + //__shared__ bool isPxInImage[B_Y*pixelsPerThread]; + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + __syncthreads(); + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = + paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = + paddingStart + (m % numModulesX) * moduleStride; + int pxY = (imgLoadModPosY + (blockPixelOffset + tidx) / filterSize); + int pxX = (imgLoadModPosX + (blockPixelOffset + tidx) % filterSize); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? 
pixIdx + : -1; + // isPxInImage[tidx] = ; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (/*loadY < B_X*filtersPerThread &&*/ ( + !checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = + hidActs[caseIdx + y * numImages * numModules + m * numImages]; + } + } + } +#pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { +// if (loadY < B_Y * pixelCache) { // This condition is not necessary for +// correctness, but it speeds things a bit +/* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some imgGrads from filter pixels that don't exit (it'll set + * those to 0), but the code does not produce any output for those pixels (see + * last lines). + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + } + //} + + __syncthreads(); + +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int p = 0; p < pixelCache; p++) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + prod[c][pp + p][f] += + shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread + colors and B_X * 
filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this + routine will + * fail for filters >= 256*256. I'm assuming I won't ever use + such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__global__ void conv_weight_acts_mc_mf_kepler( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int partialSum, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int outputModuleIdx = blockIdx.x / numFilterBlocks; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += outputModuleIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + 
(m % numModulesX) * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) { + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the + // time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop + * the right number of times. + * + * This will load some images from filter pixels that don't exist + * (it'll set those to 0), but the code does not produce any output + * for those pixels (see last lines). + */ + if (loadY < B_Y * colorsPerThread) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = + images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } + } + } + + if (loadY < B_X * filtersPerThread) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = hidActs + [caseIdx + y * numImages * numModules + m * numImages]; + } + } + } + } else { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][f]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[c][f]; + } + } + } +} + +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread + colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y 
determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this + routine will + * fail for filters >= 256*256. I'm assuming I won't ever use + such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__global__ void conv_weight_acts_mc_mf_kepler_sw( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + 
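+  // Together with mStartX/mStartY above, the mEnd bounds below clamp the module
+  // loop to the modules of this chunk whose receptive field keeps this block's
+  // filter pixel (blockPixelX, blockPixelY) inside the image.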
const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + + /* + * Note; iterating this way is about 1% slower and uses a few more registers + * than iterating over the modules linearly. But it's consistent with the + * preload routines, so I'm using it. + */ + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + for (int mx = mStartX; mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = + (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the + // time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop + * the right number of times. + * + * This will load some images from filter pixels that don't exist + * (it'll set those to 0), but the code does not produce any output + * for those pixels (see last lines). 
+ */ + if (loadY < B_Y * colorsPerThread) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = + images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } + } + } + + if (loadY < B_X * filtersPerThread) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = hidActs + [caseIdx + y * numImages * numModules + m * numImages]; + } + } + } + } else { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * colorsPerThread) { + shImgLoad[(y)*preloadCases] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * + shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][f]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[c][f]; + } + } + } +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. 
+ * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__global__ void conv_weight_acts_c_kepler_sw( + float* images, + float* hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; 
mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + + __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + // const int imgLoadModPosY = paddingStart + my * + // moduleStride; const int imgLoadModPosX = paddingStart + // + mx * moduleStride; + int pxY = (imgLoadModPosY + fYOff); + int pxX = (imgLoadModPosX + fXOff); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (/*loadY < B_X*filtersPerThread &&*/ ( + !checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + const int fIdx = ((loadY + y) % filtersPerThread) * B_X + + (loadY + y) / filtersPerThread; + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + loadY + y < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = hidActs + [caseIdx + fIdx * numImages * numModules + m * numImages]; + } + } + } else { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // const int fIdx = ((loadY + y) % + // filtersPerThread) * B_X + (loadY + y) / + // filtersPerThread; + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || + loadY + y < B_X * filtersPerThread) { + shHidActs[loadY + y][loadX] = 0; + } + } + } +#pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { +// if (loadY < B_Y * pixelCache) { // This condition is not necessary for +// correctness, but it speeds things a bit +/* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some imgGrads from filter pixels that don't exit (it'll set + * those to 0), but the code does not produce any output for those pixels (see + * last lines). 
+ */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; + y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + } + //} + + __syncthreads(); + +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int i = 0; i < preloadCases; i++) { +#pragma unroll + for (int p = 0; p < pixelCache; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][pp + p][f] += + shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y] + [i] * + shHidActs[threadIdx.x * filtersPerThread + f][i]; + } + } + } + } + + __syncthreads(); + } + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +#define WA_C3_LOOP(pp, c) \ + _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + prod[c][(pp) + p][f] += \ + shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \ + shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ + } + +#define WA_C3_LOOP2(pp) \ + _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") for (int c = 0; c < 3; ++c) { \ + prod[c][(pp) + p][f] += \ + shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \ + shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ + } \ + } + +#define WA_3_FIDX(y) \ + (((loadY + (y)*B_X * B_Y / preloadCases) % filtersPerThread) * B_X + \ + (loadY + (y)*B_X * B_Y / preloadCases) / filtersPerThread) + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * 
filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +//__launch_bounds__(256,2) +__global__ void conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; + // images += loadX; + // hidActs += blockFilterIdx * numImages * numModules + // + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { 
+#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; + // if (!doWork) { + // hidActs -= + // } + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [8] + // if (blockIdx.x != 0 || blockIdx.y !=0) { + // return; + // } + // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, + // mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [8] + + int m = mStartY * numModulesX + mStartX; + + int fidx[filtersPerThread * preloadCases / B_Y]; + if (doWork) { +#pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + const int fIdx = WA_3_FIDX(y); + // if (doWork) { + haPreload[y] = tex1Dfetch( + hidActs, + hidActsOffset + fIdx * numImages * numModules + m * numImages); + // } + fidx[y] = fIdx * numImages * numModules; + } + } + + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = my * numModulesX + mx; + + // __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + // const int imgLoadModPosY = paddingStart + my * + // moduleStride; const int imgLoadModPosX = paddingStart + // + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + + int myNext = my, mxNext = mx, mNext = m; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + mNext = myNext * numModulesX + mxNext; + } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = &hidActs[caseIdx + + // preloadCases + m * numImages]; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + + if (lastBatch) { + // ha = &hidActs[mNext * numImages]; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY + y][loadX] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + +/* ================================================================================== + * Iteration 0 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + if (pxIdx + blockPixelOffset < filterPixels) { + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + WA_C3_LOOP(0, 0); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + WA_C3_LOOP(0, 1); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + WA_C3_LOOP(0, 2); + haPreload[6] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[6]); + haPreload[7] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[7]); + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + // if (threadIdx.x == 3) + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X + * filters threadIdx.x determines filter threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of + * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * 
Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if + * checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, + * numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when + * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's + * unable to optimize that case away. + */ +template < + int B_Y, + int B_X, + int pixelCache, + int pixelsPerThread, + int filtersPerThread, + int preloadCases, + int numColors, + bool scale, + bool checkCaseBounds> +__launch_bounds__(256, 2) __global__ + void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors] + [preloadCases]; // preload preloadCases cases of B_Y + // * pixelsPerThread pixels + __shared__ float + shHidActs[B_X * filtersPerThread] + [preloadCases + 1]; // preload preloadCases cases of B_X hidActs + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = + B_X * filtersPerThread * (blockIdx.x % numFilterBlocks); + + // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; + // images += loadX; + // hidActs += blockFilterIdx * numImages * numModules + // + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + // float* shImgLoad = &shImages[loadY][loadX]; + // float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; 
f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + + // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [6] + // if (blockIdx.x != 0 || blockIdx.y !=0) { + // return; + // } + // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, + // mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y * pixelsPerThread]; + // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [6] + + int m = mStartY * numModulesX + mStartX; + int fidx[filtersPerThread * preloadCases / B_Y]; + // if (doWork) { +#pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + fidx[y] = WA_3_FIDX(y) * numImages * numModules; + if (doWork) { // Not actually necessary, I think + haPreload[y] = + tex1Dfetch(hidActs, hidActsOffset + fidx[y] + m * numImages); + } + } + // } + int mNext = mStartY * numModulesX + mStartX; + for (int my = mStartY; my < mEndY; my++) { + // const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = mNext; // my * numModulesX + mx; + + // __syncthreads(); + // const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX + ? pixIdx + : -1; + } + __syncthreads(); + + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + mNext = lastModule * m + + !lastModule * + ((my + (mx + 1 == mEndX)) * numModulesX + + (mx + 1 == mEndX ? mStartX : mx + 1)); + // if (!lastModule) { + // const int mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + // const int myNext = my + (mx + 1 == mEndX); + // mNext = myNext * numModulesX + mxNext; + // } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = hidActs + !lastBatch * + // (caseIdx + preloadCases + m * numImages) + lastBatch * + // mNext * numImages; + const int hidActsOffset2 = hidActsOffset + + !lastBatch * (caseIdx + preloadCases + m * numImages) + + lastBatch * mNext * numImages; + // if (lastBatch) { + // ha = &hidActs[mNext * numImages]; + // } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY + y][loadX] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + +/* ================================================================================== + * Iteration 0 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + + WA_C3_LOOP2(0); + + __syncthreads(); + +/* ================================================================================== + * Iteration 1 + * ================================================================================== + */ +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + // const int pxIdx = 2 * B_Y + loadY + y; // + // pixel idx in filter +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + +#pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of + // rows filled per iteration + if ((B_Y * pixelCache) % 
(B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y * pixelCache) { + const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = + pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && + (!checkCaseBounds || caseIdx + loadX < numImages)) { +#pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY + y + c * pixelCache * B_Y][loadX] = + tex1Dfetch( + images, + imgOffset + caseIdx + c * imgPixels * imgStride + + pixIdx); + } + } + } + } + + __syncthreads(); + + WA_C3_LOOP2(2); + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleTargets * + targets[p * B_Y * numFilters + + c * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { +#pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { +#pragma unroll + for (int c = 0; c < numColors; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets + [p * B_Y * numFilters + c * filterPixels * numFilters + + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. 
* + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(128, 4) __global__ + void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + // avoid bank conflict by reorganizing the data structure, and improve the + // band width by using 'float2' instead of 'float' + __shared__ float2 + shImages[preloadCases] + [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases] + [filtersPerThread * B_X / 2 + + 2]; // preload preloadCases cases of B_X hidacts + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int tidx = B_X * ty + tx; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + ty) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + 
max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + // reduce 2 registers + // float* shHidActLoad = &shHidActs[loadY][loadX]; + // float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [8] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, + mStartX, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdx, + m); + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + // It's bizarre, but this is the fastest way I've found to get it not to + // load nonexistent pixels. All other ways cause crazy excessive register + // usage. + const int idx = (mStartY < mEndY && mStartX < mEndX) * + (0 + y * imgPixels * imgStride + pixIdx); + imPreload[y * preloadCases / (B_X * B_Y)] = + tex1Dfetch(images, imgOffset + idx); + } + } + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + // Almost certainly not necessary here. + const int idx = (mStartY < mEndY && mStartX < mEndX) * + (0 + y * numImages * numModules + m * numImages); + haPreload[y * preloadCases / (B_X * B_Y)] = + tex1Dfetch(hidActs, hidActsOffset + idx); + } + } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, + mxNext, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdxNext, + mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { +// store the preloaded image's pixel into shared memory +#pragma unroll + for (int y = 0; y < 4; y++) { + shImages[loadX][loadY + y * 8].x = imPreload[y]; + shImages[loadX][loadY + y * 8].y = imPreload[y + 4]; + } + // const float* im = &images[caseIdx + preloadCases + pixIdx]; + // const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + // store the images and hidActs + shHidActs[loadX][loadY].x = haPreload[0]; + shHidActs[loadX][loadY].y = haPreload[2]; + shHidActs[loadX][loadY + 16].x = haPreload[4]; + shHidActs[loadX][loadY + 16].y = haPreload[6]; + shHidActs[loadX][loadY + 8].x = haPreload[1]; + shHidActs[loadX][loadY + 8].y = haPreload[3]; + shHidActs[loadX][loadY + 24].x = haPreload[5]; + shHidActs[loadX][loadY + 24].y = haPreload[7]; + +// preloade the image's and hidAct's pixel +#pragma unroll + for (int r = 0; r < 8; r++) { + imPreload[r] = tex1Dfetch( + images, imgOffset2 + (r)*8 * imgPixels * imgStride); + haPreload[r] = tex1Dfetch( + hidActs, hidActsOffset2 + (r)*8 * numImages * numModules); + } + + __syncthreads(); +// put together the instructions of same type to improve instruction-level +// parallelism +#pragma unroll + for (int r = 0; r < 16; r++) { + for (int c = 0; c < 4; c++) { + prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x; + prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y; + prod[2][c] += + shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x; + prod[3][c] += + shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y; + prod[0][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x; + prod[1][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y; + prod[2][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x; + prod[3][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y; + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(256, 2) __global__ + void 
conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y] + [preloadCases]; // preload preloadCases cases + __shared__ float + shHidActs[filtersPerThread * B_X] + [preloadCases + 1]; // preload preloadCases cases of B_X hidacts + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + loadX; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = 
&shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [6] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [16] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, + mStartX, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdx, + m); + + if (doWork) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + imPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch( + images, imgOffset + y * imgPixels * imgStride + pixIdx); + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch( + hidActs, hidActsOffset + y * numImages * numModules + m * numImages); + } + } + // if (mStartY > mEndY || mStartX > mEndX) { + // printf("crzy!!\n"); + // } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, + mxNext, + paddingStart, + numModulesX, + moduleStride, + blockPixelY, + blockPixelX, + imgSizeX, + imgStride, + pixIdxNext, + mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { +#pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; + y += (B_X * B_Y) / preloadCases) { + shImgLoad[(y)*preloadCases] = + imPreload[y * preloadCases / (B_X * B_Y)]; + } + +#pragma unroll + for (int y = 0; y < B_X * filtersPerThread; + y += (B_X * B_Y) / preloadCases) { + shHidActLoad[y * (preloadCases + 1)] = + haPreload[y * preloadCases / (B_X * B_Y)]; + } + + __syncthreads(); + + // const float* im = &images[caseIdx + preloadCases + + // pixIdx]; const float* ha = &hidActs[caseIdx + + // preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = + hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + WA_LOOP(0); + WA_LOOP(1); + WA_LOOP(2); + WA_LOOP(3); + WA_LOOP(4); + + WA_LOOP(5); + WA_IMLOAD_TX(0); + WA_LOOP(6); + WA_IMLOAD_TX(1); + WA_LOOP(7); + WA_IMLOAD_TX(2); + WA_LOOP(8); + WA_IMLOAD_TX(3); + WA_LOOP(9); + WA_IMLOAD_TX(4); + WA_LOOP(10); + WA_IMLOAD_TX(5); + + WA_LOOP(11); + WA_HALOAD_TX(0); + WA_LOOP(12); + WA_HALOAD_TX(1); + WA_LOOP(13); + WA_HALOAD_TX(2); + WA_LOOP(14); + WA_HALOAD_TX(3); + WA_LOOP(15); + WA_HALOAD_TX(4); + WA_LOOP(16); + WA_HALOAD_TX(5); + WA_LOOP(17); + WA_HALOAD_TX(6); + WA_LOOP(18); + WA_HALOAD_TX(7); + WA_LOOP(19); + WA_HALOAD_TX(8); + WA_LOOP(20); + WA_HALOAD_TX(9); + WA_LOOP(21); + WA_HALOAD_TX(10); + WA_LOOP(22); + WA_HALOAD_TX(11); + WA_LOOP(23); + WA_HALOAD_TX(12); + WA_LOOP(24); + WA_HALOAD_TX(13); + WA_LOOP(25); + WA_HALOAD_TX(14); + WA_LOOP(26); + WA_HALOAD_TX(15); + + WA_LOOP(27); + WA_LOOP(28); + WA_LOOP(29); + WA_LOOP(30); + WA_LOOP(31); + + __syncthreads(); + } + } + } 
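+ // Write-back: when scale is true the accumulated products are blended into + // the existing weight gradients (the old value scaled by scaleTargets, plus + // prod scaled by scaleOutputs); otherwise targets is simply overwritten with + // prod scaled by scaleOutputs.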
+ + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * + targets[c * B_Y * filterPixels * numFilters + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +/*****************************Function Revision + *Record***************************** Author: Tencent BestImage + *Team(ankerguo@tencent.com) * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + */ +template < + int B_Y, + int B_X, + int filtersPerThread, + int colorsPerThread, + int preloadCases, + bool scale> +__launch_bounds__(256, 2) __global__ + void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16( + cudaTextureObject_t images, + cudaTextureObject_t hidActs, + float* targets, + const int numImages, + const int numFilters, + const int numModulesY, + const int numModulesX, + const int imgSizeY, + const int imgSizeX, + const int filterSize, + const int paddingStart, + const int moduleStride, + const int imgStride, + const int numImgColors, + const int numGroups, + const int sumWidth, + const float scaleTargets, + const float scaleOutputs) { + // avoid bank conflict by re-organizing the data structure, and improve band + // width by using 'float2' instead of 'float' + __shared__ float2 + shImages[preloadCases] + [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases] + [filtersPerThread * B_X / 2 + + 2]; // preload preloadCases cases of B_X hidacts + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + // const int tidx = B_X * threadIdx.y + threadIdx.x; + // reduce two registers + // const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + // const int filterPixels = filterSize * filterSize; + // reduce one register + const int filterPixelsAll = numFilters * filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + // const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = + filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * 
numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, + blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = + (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride + + (ty * B_X + tx) % preloadCases; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + ((ty * B_X + tx) / preloadCases) * numImages * numModules + + ((ty * B_X + tx) % preloadCases); + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + // usie one temporary register instead of multiple registers + const int pIdxBase = imgStride * + ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX); + + targets += blockModuleChunkIdx * numFilters * filterSize * filterSize * + numFilterColors + + (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters + + blockPixelOffset * numFilters + blockFilterIdx + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = + max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = + max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = + min(numModulesX, + min(blockModuleStartX + sumWidth, + DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = + min(numModulesY, + min(blockModuleStartY + sumWidth, + DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + // reduce 3 registers + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + // float* shHidActLoad = &shHidActs[loadY][loadX]; + // float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases * colorsPerThread / B_X]; // [4] + float haPreload[preloadCases * filtersPerThread / B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + // int pixIdx, pixIdxNext, m, mNext; + + // conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + // mStartY, mStartX, paddingStart, numModulesX, moduleStride, + // blockPixelY, blockPixelX, imgSizeX, imgStride, + // pixIdx, m); + + const int pixIdx = + pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride; + const int m = (mStartY * numModulesX + mStartX); + + // preload the image's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) { +#pragma unroll + for (int i = 0; i < 4; i++) { + imPreload[i] = tex1Dfetch( + images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx); + } + } + + // preload the hidAct's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) { +#pragma unroll + for (int i = 0; i < 8; i++) { + haPreload[i] = tex1Dfetch( + hidActs, + hidActsOffset + 16 * i * numImages * numModules + m * numImages); + } + } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + for (int caseIdx = 0; caseIdx < numImages; caseIdx 
+= preloadCases) { + int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase + + (my * imgSizeX + mx) * moduleStride * imgStride; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + + (my * numModulesX + mx) * numImages; + + if (caseIdx + preloadCases == numImages) { + const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + const int myNext = my + (mx + 1 == mEndX); + + imgOffset2 = imgOffset + +pIdxBase + + (myNext * imgSizeX + mxNext) * moduleStride * imgStride; + hidActsOffset2 = + hidActsOffset + (myNext * numModulesX + mxNext) * numImages; + } + + if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) { + // store the previously preloaded pixel into shared memory + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .x = imPreload[0]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .y = imPreload[2]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .x = imPreload[1]; + shImages[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .y = imPreload[3]; + } + + if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) { + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .x = haPreload[0]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases] + .y = haPreload[2]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 32] + .x = haPreload[4]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 32] + .y = haPreload[6]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .x = haPreload[1]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 16] + .y = haPreload[3]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 48] + .x = haPreload[5]; + shHidActs[(ty * B_X + tx) % preloadCases] + [(ty * B_X + tx) / preloadCases + 48] + .y = haPreload[7]; + } + +#pragma unroll + for (int r = 0; r < 8; r++) { + haPreload[r] = tex1Dfetch<float>( + hidActs, hidActsOffset2 + r * 16 * numImages * numModules); + } + +#pragma unroll + for (int r = 0; r < 4; r++) { + imPreload[r] = tex1Dfetch<float>( + images, imgOffset2 + r * 16 * imgPixels * imgStride); + } + __syncthreads(); + +// put together the instructions of the same type to improve instruction-level +// parallelism; calculate the derivative of the hidAct with respect to the weight +#pragma unroll + for (int r = 0; r < 16; r++) { +#pragma unroll + for (int c = 0; c < 4; c++) { + prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x; + prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y; + prod[2][c] += + shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x; + prod[3][c] += + shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y; + prod[0][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x; + prod[1][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y; + prod[2][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x; + prod[3][c + 4] += + shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y; + } + } + + __syncthreads(); + } + } + } + + if (scale) { +#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixelsAll + f * B_X] = + scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] + + scaleOutputs * prod[f][c]; + } + } + } else { +#pragma unroll + for (int c = 0; c < 
colorsPerThread; c++) { +#pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixelsAll + f * B_X] = + scaleOutputs * prod[f][c]; + } + } + } +} + +std::pair<int, int> getWeightActsOutputSize( + int numModulesY, + int numModulesX, + int numFilterColors, + int filterSize, + int numFilters, + int sumWidth) { + const int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + const int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + const int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; + return std::pair<int, int>( + outputModuleChunks * numFilterColors * filterSize * filterSize, + numFilters); +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModules, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, + * filterPixels, numFilters) + * + * TODO: you can get a slight speed boost for local non-convolutional units by + * writing special routines for partialSum = 1. But I dunno if the code + * duplication is worth it... + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but I made no attempt whatsoever + * to make them work fast. + */ +void _weightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int sumWidth, + float scaleTargets, + float scaleOutput) { + CAFFE_ENFORCE(images->ndim() == 2); + CAFFE_ENFORCE(hidActs->ndim() == 2); + CAFFE_ENFORCE(targets->ndim() == 2); + + int numFilterColors = numImgColors / numGroups; + int imgStride = images->dim32(1); + int numImages = images->dim32(1); + int imgPixels = images->dim32(0) / numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int numModules = numModulesY * numModulesX; + int numFilters = hidActs->dim32(0) / numModules; + int numFiltersPerGroup = numFilters / numGroups; + + CAFFE_ENFORCE(numImgColors % numGroups == 0); + CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0); + CAFFE_ENFORCE( + numGroups > 1 || + (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 16 == 0))); + CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 16 == 0); + CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels); + CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors); + + int filterPixels = filterSize * filterSize; + int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; + // partialSum = partialSum == 0 ? 
numModules : partialSum; + + // CAFFE_ENFORCE(numModules % partialSum == 0); + CAFFE_ENFORCE(hidActs->dim32(1) == numImages); + + // These routines don't handle the case when only part of the image is visited + // in the convolution + CAFFE_ENFORCE(paddingStart <= 0); + CAFFE_ENFORCE( + paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX); + CAFFE_ENFORCE( + paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY); + CAFFE_ENFORCE(moduleStride <= filterSize); + + CAFFE_ENFORCE(numModules * numFilters == hidActs->dim32(0)); + + int preloadCases = 32; + + dim3 blocks, threads; + int bx, by; + int pixelsPerThread, filtersPerThread, colorsPerThread; + // Worth playing with these parameters to find best values for your problem. + // These values work relatively well, but not optimal for all problems. + if (numFilterColors > 3) { + filtersPerThread = + numFiltersPerGroup % 64 == 0 ? 4 : numFiltersPerGroup % 32 == 0 ? 2 : 1; + colorsPerThread = numFilterColors % 64 == 0 + ? 8 + : numFilterColors % 48 == 0 ? 6 : numFilterColors % 32 == 0 ? 8 : 4; + by = (numFilterColors / colorsPerThread) % 8 == 0 ? 8 : 4; + bx = numFiltersPerGroup % 128 == 0 ? 32 : 16; + preloadCases = filtersPerThread * colorsPerThread < 32 ? 32 : 16; + blocks = dim3( + outputModuleChunks * (numFilters / (bx * filtersPerThread)), + numFilterColors / (by * colorsPerThread), + filterPixels); + CAFFE_ENFORCE(numFilterColors % (by * colorsPerThread) == 0); + } else { // This is ugly but it's nice to spell it out clearly + CAFFE_ENFORCE(numGroups == 1); // Just for sanity + // NOTE: these things are only optimized for colors = 3. I didn't really + // test other cases. + if (numFilters % 64 == + 0) { // TODO: having a separate case for 128 would make things faster, + // but I probably don't care about 128 + filtersPerThread = 4; + pixelsPerThread = 2; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 48 == 0) { + filtersPerThread = 3; + pixelsPerThread = 4; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 32 == 0) { + filtersPerThread = 2; + pixelsPerThread = 2; + by = 8; + bx = 16; + preloadCases = 16; + } else { // This case is completely untested. It might be really slow. But + // no time now. 
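+      // For example, with numFilters == 16 and a 4x4 filter (filterPixels == 16),
+      // the settings below give blocks = dim3(outputModuleChunks * 1, 1) and
+      // threads = dim3(16, 16): one filter per thread, with a single block in the
+      // y dimension covering all 16 filter pixels.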
+ filtersPerThread = 1; + pixelsPerThread = 16; + by = 16; + bx = 16; + preloadCases = 32; + } + blocks = dim3( + outputModuleChunks * (numFilters / (bx * filtersPerThread)), + DIVUP(filterPixels, by * pixelsPerThread)); + } + CAFFE_ENFORCE((by * bx) % preloadCases == 0); + CAFFE_ENFORCE(numFilters % (bx * filtersPerThread) == 0); + threads = dim3(bx, by); + bool checkCaseBounds = numImages % preloadCases != 0; + bool scale = scaleTargets != 0; + std::pair targetSize = getWeightActsOutputSize( + numModulesY, + numModulesX, + numFilterColors, + filterSize, + numFilters, + sumWidth); + if (!scale) { + targets->Resize(std::vector{targetSize.first, targetSize.second}); + } else { + CAFFE_ENFORCE(targets->dim32(0) == targetSize.first); + CAFFE_ENFORCE(targets->dim32(1) == targetSize.second); + } + + float* images_data = images->mutable_data(); + float* hidacts_data = hidActs->mutable_data(); + float* targets_data = targets->mutable_data(); + const std::size_t images_bytes = images->nbytes(); + + cudaStream_t stream = context->cuda_stream(); + + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + 
conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + 
paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 
16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + 
numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + false, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false> + <<>>( + images_data, + 
hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + 
numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, 
+ imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, 
+ filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + false, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } + } else if (scale == true) { + if (checkCaseBounds == false) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< + 8, + 32, + 4, + 8, + 16, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + 
checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< + 8, + 16, + 4, + 8, + 16, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< + 8, + 32, + 4, + 6, + 32, + true><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, 
+ paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + 
numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 48 == 0) { + cudaTextureObject_t tex_images = GetTensorTextureObject(images); + cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); + cudaFuncSetCacheConfig( + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + false><<>>( + tex_images, + tex_hidacts, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + checkCudaErrors(cudaDestroyTextureObject(tex_images)); + checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + 
numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 8, + 16, + 2, + 2, + 2, + 16, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + 
imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + true, + false>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, false> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } else if (checkCaseBounds == true) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + 
imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors % 16 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + 
numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>, + cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + numImgColors, + numGroups, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 3, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); 
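// --- Editorial sketch (not part of the original diff) -----------------------
// Every branch of the dispatch above repeats one pattern: a runtime
// divisibility test on numFiltersPerGroup selects a compile-time tile
// configuration of conv_weight_acts_c_kepler_sw, the chosen instantiation is
// given a shared-memory cache preference, and it is then launched. The
// <<<...>>> launch arguments appear to have been stripped from this diff text;
// `blocks`, `threads` and `stream` below are assumed to stand for the
// grid/block/stream values computed earlier in the host function.
if (numFiltersPerGroup % 64 == 0) {
  cudaFuncSetCacheConfig(
      conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true>,
      cudaFuncCachePreferShared);
  conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true>
      <<<blocks, threads, 0, stream>>>(
          images_data, hidacts_data, targets_data,
          numImages, numFilters, numModulesY, numModulesX,
          imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride,
          imgStride, sumWidth, scaleTargets, scaleOutput);
}
// ---------------------------------------------------------------------------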
+ } + } else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 2, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 2, + 4, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 4, + 3, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } else if (numFiltersPerGroup % 16 == 0) { + 
cudaFuncSetCacheConfig( + conv_weight_acts_c_kepler_sw< + 16, + 16, + 2, + 16, + 1, + 32, + 1, + true, + true>, + cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, true> + <<>>( + images_data, + hidacts_data, + targets_data, + numImages, + numFilters, + numModulesY, + numModulesX, + imgSizeY, + imgSizeX, + filterSize, + paddingStart, + moduleStride, + imgStride, + sumWidth, + scaleTargets, + scaleOutput); + } + } + } + } + } + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); + getLastCudaError("weightActs: kernel execution failed"); +} + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int partialSum) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + partialSum, + 0, + 1); +} + +void convWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + int partialSum, + float scaleTargets, + float scaleOutput) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + partialSum, + scaleTargets, + scaleOutput); +} + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 1, + 0, + 1); +} + +void localWeightActs( + caffe2::CUDAContext* context, + caffe2::TensorCUDA* images, + caffe2::TensorCUDA* hidActs, + caffe2::TensorCUDA* targets, + int imgSizeY, + int numModulesY, + int numModulesX, + int filterSize, + int paddingStart, + int moduleStride, + int numImgColors, + int numGroups, + float scaleTargets, + float scaleOutput) { + _weightActs( + context, + images, + hidActs, + targets, + imgSizeY, + numModulesY, + numModulesX, + filterSize, + paddingStart, + moduleStride, + numImgColors, + numGroups, + 1, + scaleTargets, + scaleOutput); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile b/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile new file mode 100644 index 0000000..2e1c1e7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile @@ -0,0 +1,112 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. 
NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) +MODELNAME := _ConvNet +LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3 +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH) + +DEFINES := -DNUMPY_INTERFACE + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . -name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/$(MODELNAME).so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS) + ln -sf $(TARGET) . 
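# --- Editorial note (not part of the original diff) --------------------------
# Typical invocations of this Makefile. The environment variables are the ones
# the Makefile already references; the example paths are illustrative only.
#
#   export CUDA_INSTALL_PATH=/usr/local/cuda
#   export CUDA_SDK_PATH=/usr/local/cuda/samples
#   export PYTHON_INCLUDE_PATH=/usr/include/python2.7
#   export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include
#
#   make            # release build (-O3), produces ./bin/release/_ConvNet.so
#   make dbg=1      # debug build (-g -G), produces ./bin/debug/_ConvNet.so
#   make prof=1     # adds --ptxas-options=-v for register/occupancy reporting
# -----------------------------------------------------------------------------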
+ +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/__init__.py b/caffe2/contrib/cuda-convnet2/cudaconvnet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh new file mode 100644 index 0000000..58e34a5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh @@ -0,0 +1,66 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ACTBROADCASTER_CUH_H_ +#define ACTBROADCASTER_CUH_H_ + +#include +#include "streambroadcast.cuh" +#include "copypipeline.cuh" + +class BroadcastMessage { +public: + enum MESSAGE_TYPE { + BROADCAST, + EXIT + }; +protected: + int _srcDevice; + std::map _mats; + int _userIdx; + Queue* _finishQueue; + MESSAGE_TYPE _type; + BroadcastMessage(MESSAGE_TYPE type); +public: + BroadcastMessage(std::map mats, int srcDevice, int userIdx, Queue& finishQueue); + + int getSrcDevice(); + std::map& getMatrices(); + int getUserIdx(); + Queue& getFinishQueue(); + MESSAGE_TYPE getMessageType(); +}; + +class ExitBroadcastMessage : public BroadcastMessage { +public: + ExitBroadcastMessage(); +}; + +class ActBroadcaster : public Thread { +protected: + std::map _broadcasters; // src device --> broadcaster + Queue _messageQueue; + int _numUsers; +public: + ActBroadcaster(int numUsers, intv& cpus); + ~ActBroadcaster(); + Queue& getMessageQueue(); + virtual void* run(); + void stop(); +}; + + +#endif /* ACTBROADCASTER_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh new file mode 100644 index 0000000..230a721 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh @@ -0,0 +1,180 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CONVNET3 +#define CONVNET3 + +#include +#include +#include +#include +#include +#include +#include "../../util/include/queue.h" +#include "../../util/include/thread.h" +#include +#include "../../util/include/sync.h" +#include "messages.cuh" +#include "streambroadcast.cuh" + +#include "layer.cuh" +#include "data.cuh" +#include "worker.cuh" +#include "weights.cuh" +#include "pipedispenser.cuh" +#include "timer.cuh" + +class Worker; +class WorkResult; +class Layer; +class DataLayer; +class CostLayer; +class ConvNetThread; +class StreamBroadcast; +class Weights; + +// name -> device id -> layer* +typedef std::map > NameReplicaLayerMap; +typedef std::map NameLayerMap; +// name -> ReplicaMap +//typedef std::map ReplicaNameLayerMap; +typedef std::vector ConvNetThreadV; +typedef std::vector DataLayerVector; +//typedef std::map ReplicaThreadsMap; + +class ConvNet : public Thread { +private: + void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights); +protected: + NameReplicaLayerMap _layerMap; + DataLayerVector _dataLayers; + // Vector of convnet threads (one thread == one GPU) + ConvNetThreadV _convNetThreads; + + DataProvider* _dp; + CPUData* _data, *_bufferData; + int _bufferMinibatchIdx, _bufferPassIdx; + ThreadSynchronizer* _sync; + intv _deviceIDs; + + Queue _workerQueue; + Queue _resultQueue; + Queue _msgQueue; + + int _numFwdTerminal; + std::map _numBwdTerminal; // pass idx -> #terminal + int _totalPassesDone; + int _numReplicasMin, _numReplicasMax; + // For gradient checking + int _numFailures; + int _numTests; + + // Training progress (between 0 and 1). + // Used to determine learning rate based on ParameterSchedule. + double _trainingProgress; + double _baseErr; + bool _conserveMem; + PipeDispenser *_dataCopyPD; + + void waitForTerminals(int numMsgs, MESSAGES msg); + void sendMessage(MESSAGES msg, bool sync); + void sendMessage(Message* msg, bool sync); + void findBwdTerminal(Layer& l, std::set& visited, int& terminal, int passIdx); + void connectReplicas(); + void initDataLayers(PyObjectV* layerList); + void initGPUThreads(PyObjectV* layerList); + void connectChildren(PyObject* layerParams); + void* run(); + void setData(CPUData& data, int passIdx); + void setDataFromBuffer(); + void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx); +public: + ConvNet(PyObject* layerParams, intv& deviceIDs, + int minibatchSize, bool conserveMem); + ~ConvNet(); + void stop(); + + Queue& getMessageQueue(); + Queue& getWorkerQueue(); + Queue& getResultQueue(); + DataProvider& getDataProvider(); + + Layer& getLayer(std::string& name, int replicaID); + void copyToCPU(); + void copyToGPU(); + void updateWeights(int passIdx); + void reset(int passIdx); + void reset(); + + void bprop(int passIdx, PASS_TYPE passType); + void fprop(int miniIdx, int passIdx, PASS_TYPE passType); + void fprop(CPUData& data, int passIdx, PASS_TYPE passType); + + void setTrainingProgress(double progress); + double getTrainingProgress() const; + + bool checkGradient(const std::string& name, float eps, Weights& weights); + void checkGradients(); + Cost& getCost(); + Cost& getCost(Cost& cost); + CPUData& getData(); // Returns last minibatch fpropped + double getCostValue(); + intv& getDeviceIDs(); + ThreadSynchronizer& getSync(); + void syncWithChildren(); + int getMinibatchSize(); + bool isConserveMemory(); + int getNumReplicasMax(); + int getNumReplicasMin(); + int getNumPasses(); + int getTotalPassesDone(); + PipeDispenser& getDataCopyPD(); +}; + +class ConvNetThread : public 
Thread { +protected: + NameLayerMap _nameLayerMap; + std::vector _costs; + ConvNet* _convNet; + int _deviceID; + Queue _msgQueue; + Timer _timer; +// StreamBroadcast* _weightSynchronizer; + + void initCuda(); + virtual void initLayer(PyObject* paramsDict, int replicaID); + void* run(); +public: + ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet); + ~ConvNetThread(); + + NameLayerMap& getLayerMap(); + int getDeviceID(); + + ConvNet& getConvNet(); + + Queue& getMessageQueue(); + std::vector& getCostLayers(); +// StreamBroadcast& getWeightSynchronizer(); + + Cost& getCost(); + Layer& getLayer(std::string& name); + void startTimer(); + double stopTimer(); +}; + +#endif /* CONVNET */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh new file mode 100644 index 0000000..f9dfa81 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh @@ -0,0 +1,218 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COPYPIPELINE_CUH_ +#define COPYPIPELINE_CUH_ + +#include +#include "../../util/include/thread.h" +#include "../../util/include/queue.h" +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k +#define COPY_MAX_CHUNKS 16 +#define COPY_MIN_CHUNKS 2 + +class CopyPeer; +class CopySource; +class ICopySegment; +class IBroadcastNetwork; + +class CopyMessage { +protected: + std::map* _mats; + float _scaleSource, _scaleTargets; +public: + enum COPY_MESSAGE_TYPE { + COPY_CHUNK, + COPY_START, + EXIT + }; + CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map& mats) + : _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) { + } + CopyMessage(COPY_MESSAGE_TYPE msgType) + : _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) { + } + inline COPY_MESSAGE_TYPE getType() const { + return _msgType; + } + inline NVMatrix& getMatrix(int deviceID) const { + return *_mats->at(deviceID); + } + inline std::map& getMatrices() const { + return *_mats; + } + inline float getScaleSource() const { + return _scaleSource; + } + inline float getScaleTargets() const { + return _scaleTargets; + } +protected: + COPY_MESSAGE_TYPE _msgType; +}; + +class CopyChunkMessage : public CopyMessage { +protected: + int _chunkIdx; + int _chunkSize; + int _numChunks; +public: + CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map& mats) + : _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) { + } + + inline int getChunkIdx() const { + return _chunkIdx; + } + inline int getChunkSize() const { + return _chunkSize; + } + inline int getNumChunks() const { + return _numChunks; + } +}; + +class CopyStartMessage : public CopyMessage { +public: 
+ CopyStartMessage(float scaleSource, float scaleTargets, std::map& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) { + } +}; + +class ICopySegment : public Thread { +protected: + int _deviceID, _execDeviceID; + cudaStream_t _stream; + ICopySegment* _prev; + std::vector _next; + Queue _queue; + Queue* _finishQueue; + HostNVMatrix _hmat; + IBroadcastNetwork* _parent; + + NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx); + void* run(); + virtual bool processMessage(CopyMessage& msg) = 0; + +public: + ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); + virtual ~ICopySegment(); + inline NVMatrix& getMatrix(CopyMessage& msg); + Queue& getQueue(); + inline int getDeviceID(); + void addPrev(ICopySegment& c); + void addNext(CopyPeer& c); + bool isTerminal() const; + virtual bool isSource() const = 0; +}; + +class CopySource : public ICopySegment { +protected: + bool processMessage(CopyMessage& msg); +public: + CopySource(IBroadcastNetwork& parent, int deviceID); + inline bool isSource() const; +}; + +class CopyPeer : public ICopySegment { +protected: + bool processMessage(CopyMessage& msg); +public: + CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); + inline bool isSource() const; +}; + +class IBroadcastNetwork { +protected: + Queue _finishQueue; + CopySource* _src; + std::vector _peers; + int _srcDeviceID, _numTerminal; + bool _constructed; + std::set _devices; + std::pair,std::vector > makeGPULists(); + + void makePeers(std::pair,std::vector >& gpus); + virtual void makeConnections() = 0; + virtual void _broadcast(std::map& mats, float scaleSource, float scaleTargets); + IBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); +public: + virtual IBroadcastNetwork& construct(); + virtual ~IBroadcastNetwork(); + + virtual void broadcast(std::map& mats); + int getSourceDeviceID() const; + static IBroadcastNetwork& make(std::set devices, int srcDeviceID); +}; + +class ISafeBroadcastNetwork : public IBroadcastNetwork { +protected: + ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); +public: + virtual void broadcast(std::map& mats, float scaleSource, float scaleTargets); + virtual ISafeBroadcastNetwork& construct(); + static ISafeBroadcastNetwork& make(std::set devices, int srcDeviceID); +}; + +class NullBroadcaster : public ISafeBroadcastNetwork { +protected: + NullBroadcaster(std::set& devices, int srcDeviceID); + void makeConnections(); +public: + NullBroadcaster& construct(); + void broadcast(std::map& mats, float scaleSource, float scaleTargets); + void broadcast(std::map& mats); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +/* + * This one goes to host and then to targets. 
+ */ +class NaiveBroadcaster : public ISafeBroadcastNetwork { +protected: + NaiveBroadcaster(std::set& devices, int srcDeviceID); + void makeConnections(); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +class EightGPUBroadcaster1 : public IBroadcastNetwork { +protected: + EightGPUBroadcaster1(std::set& devices, int srcDeviceID); + void makeConnections(); + friend class IBroadcastNetwork; +}; + +class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork { +protected: + int _tgtDeviceID; + cudaStream_t _tgtStream; + void makeConnections(); + void resetDeviceID(int d); + void _broadcast(std::map& mats, float scaleSource, float scaleTargets); +public: + TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID); + ~TwoPeeringGPUsBroadcaster(); + ISafeBroadcastNetwork& construct(); + friend class IBroadcastNetwork; + friend class ISafeBroadcastNetwork; +}; + +#endif /* COPYPIPELINE_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh new file mode 100644 index 0000000..80270e3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh @@ -0,0 +1,56 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COST_CUH +#define COST_CUH + +#include +#include +#include + +#include "layer.cuh" +#include "util.cuh" + +class CostLayer; + +/* + * Wrapper for dictionary mapping cost name to vector of returned values. + */ +class Cost { +protected: + std::map _numCases; + CostMap _costMap; + CostCoeffMap _costCoeffMap; + std::map& getNumCasesMap(); +public: + Cost(); + Cost(std::vector& costs); + doublev& operator [](const std::string s); + CostMap& getCostMap(); + CostCoeffMap& getCostCoeffMap(); + int getNumCases(); + /* + * Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients. + */ + double getValue(); + Cost& operator += (Cost& er); + virtual ~Cost(); + void print(); +}; + + +#endif /* COST_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh new file mode 100644 index 0000000..e64601f --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh @@ -0,0 +1,101 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATA_CUH +#define DATA_CUH + +#include +#include +#include "util.cuh" + +class CPUData { +protected: + MatrixV* _data; + void assertDimensions() { + assert(_data->size() > 0); + for (int i = 1; i < _data->size(); i++) { + assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols()); + if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) { + _data->at(i)->setTrans(_data->at(i-1)->isTrans()); + } + assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans()); + } + assert(_data->at(0)->getNumCols() > 0); + } +public: + typedef typename MatrixV::iterator T_iter; + // Cases in columns, but array may be transposed + // (so in memory they can really be in rows -- in which case the array is transposed + // during the copy to GPU). + CPUData(PyObject* pyData) { + _data = getMatrixV(pyData); + assertDimensions(); + } + + CPUData(MatrixV* data) : _data(data) { + assertDimensions(); + } + + ~CPUData() { + for (T_iter it = _data->begin(); it != _data->end(); ++it) { + delete *it; + } + delete _data; + } + + Matrix& operator [](int idx) const { + return *_data->at(idx); + } + + int getSize() const { + return _data->size(); + } + + MatrixV& getData() const { + return *_data; + } + + Matrix& getData(int i) const { + return *_data->at(i); + } + + bool isTrans() const { + return _data->at(0)->isTrans(); + } + + int getNumCases() const { + return _data->at(0)->getNumCols(); + } +}; + +class DataProvider { +protected: + CPUData* _hData; + NVMatrixV _data; + int _minibatchSize; +public: + DataProvider(int minibatchSize); + void setData(CPUData&); + void clearData(); + CPUData& getMinibatch(int idx); + CPUData& getDataSlice(int startCase, int endCase); + int getNumMinibatches(); + int getMinibatchSize(); + int getNumCases(); +}; + +#endif /* DATA_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh new file mode 100644 index 0000000..84079ae --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GRADREDUCER_CUH_ +#define GRADREDUCER_CUH_ + +#include +#include +#include "streambroadcast.cuh" +#include "reducepipeline.cuh" +#include "layer.cuh" +#include "util.cuh" + +class StreamBroadcast; +class Layer; + +#define ACT_GRAD_REDUCER_EXIT (1 << 16) + +//class ReduceMessage { +// ReduceMessage(); +// ReduceMessage(bool exit); +//}; + +class IActGradReducer : public Thread { +protected: + Layer* _parent; + Queue _finishQueue; + int _numExpectedMsgsTotal; + std::map _numExpectedMsgs; // map from device id -> num expected msgs + + void* run(); + virtual bool reduce() = 0; + virtual void reset() = 0; +public: + IActGradReducer(Layer& parent, std::map numExpectedMsgs); + virtual ~IActGradReducer(); + int waitForFinish(); + virtual void enqueueReduction(int deviceID) = 0; + virtual void stop() = 0; + static IActGradReducer& makeGradReducer(Layer& parent, std::map numExpectedMsgs); +}; + +class SequentialActGradReducer : public IActGradReducer { +protected: + + std::map _numReceivedMsgs; // map from device id -> num received msgs + + std::map* > _messageQueues; + intv _deviceIDs; + StreamBroadcast* _broadcaster; + bool reduce(); + void reset(); +public: + SequentialActGradReducer(Layer& parent, std::map numExpectedMsgs); + ~SequentialActGradReducer(); + void enqueueReduction(int deviceID); + void stop(); +}; + +class ParallelActGradReducer : public IActGradReducer { +protected: + IEightGPUReducer* _reducer; + int _numReceivedMsgs; + float _scaleTarget; + Queue _messageQueue; + bool reduce(); + void reset(); +public: + ParallelActGradReducer(Layer& parent, std::map numExpectedMsgs); + void enqueueReduction(int deviceID); + void stop(); +}; + + +#endif /* GRADREDUCER_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h new file mode 100644 index 0000000..83c5061 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h @@ -0,0 +1,61 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef JPEG_MAIN_H +#define JPEG_MAIN_H + +#include +#include +#include +#include +#include +#include +#include +//#include +#include "../../util/include/thread.h" +#include "../../util/include/matrix.h" + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define NUM_JPEG_DECODER_THREADS 4 + + +class DecoderThread : public Thread { + protected: + PyObject* _pyList; + Matrix* _target; + int64 _start_img, _end_img; + int64 _img_size, _inner_size, _inner_pixels; + bool _test, _multiview; + + unsigned char* _decodeTarget; + int64 _decodeTargetSize; + unsigned int _rseed; + + void* run(); + void decodeJpeg(int idx, int& width, int& height); + double randUniform(); + double randUniform(double min, double max); + void crop(int64 i, int64 width, int64 height, bool flip); + virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y); + public: + DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview); + virtual ~DecoderThread(); +}; + +#endif // JPEG_MAIN_H diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh new file mode 100644 index 0000000..2400413 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh @@ -0,0 +1,812 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LAYER_CUH +#define LAYER_CUH + +#include +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh" + +#include "weights.cuh" +#include "convnet.cuh" +#include "cost.cuh" +#include "neuron.cuh" +#include "data.cuh" +#include "layer_kernels.cuh" +#include "streambroadcast.cuh" +#include "actbroadcaster.cuh" +#include "gradreducer.cuh" +#include "util.cuh" +#include "timer.cuh" +#include "memorysource.cuh" + +class Cost; +class ConvNet; +class ConvNetThread; +class CostLayer; +class DataLayer; +class Layer; +class ActBroadcaster; +class BroadcastMessage; +class IActGradReducer; +class Weights; +class WeightList; +typedef std::vector LayerV; + +class BinomialCrossEntOperator { +protected: + float _posWeight; +public: + BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y); + } +}; + +class CrossEntOperator { +protected: + float _posWeight; +public: + CrossEntOperator(float posWeight) : _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _posWeight * t * safelog(y); + } +}; + +/* + * Abstract layer. 
+ */ +class Layer { +protected: + ConvNetThread* _convNetThread; + + // This is a vector[#layers_next] + std::vector _next; + // This is a vector[#replicas_prev][#layers_prev] + std::map > _prev; + + int _rcvdFInputMsgs; + std::map _numComputedActsGrads; + int _rcvdBInputMsgs; + int _numOutputs; + std::map _inputs; // input idx -> matrix + std::map _memSrcActs; // device id -> memory source + std::map _memSrcActsGrad; // device id -> memory source + + bool _gradConsumer, _foundGradConsumers, _trans; + std::map _bwdTerminal; // One bool per pass + int _numGradProducersNext; + int _actsTarget, _actsGradTarget; + std::string _name, _type; + intv _nextDeviceIDs, _prevDeviceIDs; + HostNVMatrix _hostMemFwd; + + // New replica-related stuff: + std::map _replicas; // NOTE: a layer is its own sibling, too + // Previous layers sorted by device ID, in reverse order in which they are procesed by + // sequential grad reducer. map from replica -> device id -> layers + std::map > > _prevByDevice; + std::map _inputIndices; + int _replicaID; + int _numReplicas; + int _numReplicasPrev, _numReplicasNext; + + Queue _broadcastFinishQueue; + Queue _reductionFinishQueue; + ActBroadcaster* _actBroadcaster; + IActGradReducer* _gradReducer; + Timer _timer; + bool _initialized; + + virtual void fpropNext(PASS_TYPE passType, int passIdx); + virtual void truncBwdActs(); + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0; + + virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) { + // Do nothing by default + } + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(!isGradProducer()); // Only do nothing if not grad producer + } + virtual void fpropCommon(PASS_TYPE passType) { + + } + void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx); + + ActBroadcaster& getActBroadcaster(); + IActGradReducer& getGradReducer(); + int getInputIdx(std::string& parentName); + void setInputIdx(std::string& parentName, int idx); + +public: + static bool _saveActsGrad, _saveActs; + + Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + virtual ~Layer(); + + virtual bool fprop(PASS_TYPE passType, int passIdx); + void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx); + virtual void fprop(std::map& v, PASS_TYPE passType, int passIdx); + virtual void bprop(PASS_TYPE passType, int passIdx); + virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); + virtual void reset(); + virtual void resetPassIdx(); + int getNumCases(NVMatrix& v); + int& getNumComputedActsGrads(int deviceID); + int incRcvdBInputMsgs(); + bool isGradConsumer(); + bool hasGradProducerNext(std::string& layerName); + // Does this layer produce a gradient for any layer? + virtual bool isGradProducer(); + // Does this layer produce a gradient for layer of given name? 
+ virtual bool isGradProducer(std::string& layerName); + std::string& getName(); + std::string& getType(); + virtual void addNext(Layer& l); + virtual void addPrev(Layer& l, int replicaIdx); + virtual void addReplica(Layer& l); + std::map >& getPrev(); + std::vector& getNext(); + virtual NVMatrix& getActs(); + virtual NVMatrix& getActs(int deviceID); + virtual NVMatrix& getActs(int deviceID, int numCases); + virtual NVMatrix& getActsGrad(); + virtual NVMatrix& getActsGrad(int deviceID); + virtual std::map getAllActs(); + virtual std::map getAllActsGrads(); + virtual bool postInit(); + int getDeviceID(); + ConvNetThread& getConvNetThread(); + cudaStream_t getStream(); + void syncStream(); + void setBwdTerminal(int passIdx); + // Do nothing if this layer has no weights + virtual bool updateWeights() { + return false; + } + virtual bool constrainWeights() { + return false; + } + virtual void checkGradient() { + } + virtual void copyToCPU() { + } + virtual void copyToGPU() { + } + intv& getNextDeviceIDs() { + return _nextDeviceIDs; + } + + int getReplicaID(); + int getNumReplicas(); + int getNumSiblingReplicas(); + int getNumReplicasPrev(); + int getNumReplicasNext(); + int getNumOutputs(); + void setMemorySourceActs(int deviceID, MemoryView& mem); + void setMemorySourceActsGrad(int deviceID, MemoryView& mem); + MemoryView& getMemorySourceActs(int deviceID); + MemoryView& getMemorySourceActsGrad(int deviceID); + int getFwdActiveInputReplicaIdx(int passIdx); + int getBwdActiveInputReplicaIdx(int passIdx); + int getFwdActiveReplicaIdx(int passIdx); + int getNumLayersPrev(); + virtual int getNumInputReplicas(); + int getNumExpectedBwdMsgs(); + int getNumExpectedFwdMsgs(); + int getReplicaIdx(); + int getActivePassPeriod(); + int getNumGradProducersNext(); + virtual ConvNet& getConvNet(); +}; + +class TwoDLayerInterface { +protected: + int _channels, _imgSize, _imgPixels; +public: + TwoDLayerInterface(PyObject* paramsDict); +}; + +class NeuronLayer : public Layer { +protected: + Neuron* _neuron; + std::string _neuronType; + + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + class CrossEntLogisticGradientOperator { + private: + float _coeff, _posWeight; + public: + CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { + } + __device__ inline float operator()(const float y, const float t) const { + return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y); + } + }; + NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~NeuronLayer(); + std::string& getNeuronType(); +}; + +class WeightLayer : public Layer { +protected: + WeightList* _weights; + Weights *_biases; + NVMatrix _norm2; + float _wStep, _bStep; + int _weightUpdatePassPeriod; + void fpropCommon(PASS_TYPE passType); + void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType); + virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0; + virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0; + virtual void _constrainWeights(); + virtual float getGradScale(int inpIdx, PASS_TYPE passType); + virtual float getIncScale(int inpIdx, PASS_TYPE passType); + virtual float getBGradScale(PASS_TYPE passType); + virtual float getBIncScale(); + virtual NVMatrix& 
getGradTarget(int inpIdx); + NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx); + NVMatrix& getBiasMatrix(PASS_TYPE passType); +public: + WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad); + virtual ~WeightLayer(); + virtual bool updateWeights(); + virtual bool constrainWeights(); + virtual void copyToCPU(); + virtual void copyToGPU(); + virtual void checkGradient(); + Weights& getWeights(int idx); + void addReplica(Layer& l); + virtual bool postInit(); +}; + +class FCLayer : public WeightLayer { +protected: + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType); + virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + virtual void _constrainWeights(); +public: + FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); + FCLayer(); +}; + +class SplitFCLayer : public FCLayer { +protected: + int _numParts; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +// void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void splitWeights(); +public: + SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); +}; + +class SoftmaxLayer : public Layer { +protected: + bool _doUpperGrad; + NVMatrix _max, _sum; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + void setDoUpperGrad(bool b); +}; + +class ConcatenationLayer : public Layer { +protected: + intv* _copyOffsets; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual ~ConcatenationLayer(); +}; + +class PassThroughLayer : public Layer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual bool postInit(); +}; + +class EltwiseSumLayer : public Layer { +protected: + floatv* _coeffs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~EltwiseSumLayer(); +}; + +class EltwiseMaxLayer : public Layer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class SumLayer : public Layer { +protected: + int _stride; + void 
fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class DataCopyMessage { +public: + enum MESSAGE_TYPE { + COPY, + EXIT + }; +protected: + CPUData* _cpuData; + int _passIdx; + bool _other; + DataCopyMessage::MESSAGE_TYPE _type; + DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) { + } +public: + DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) { + } + + CPUData& getData() const { + return *_cpuData; + } + + int getPassIdx() const { + return _passIdx; + } + + bool isOther() const { + return _other; + } + + DataCopyMessage::MESSAGE_TYPE getType() { + return _type; + } +}; + +class DataCopyExitMessage : public DataCopyMessage { +public: + DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) { + } +}; + +class DataCopyThread; + +class DataLayer : public Layer { +protected: + bool _useBuffer; + int _dataIdx; + ConvNet* _convNet; +// std::map _outputs2; // Buffer for copying data during computation + std::map _memSrcActs2; // // Buffer for copying data during computation + std::map _copyStreams; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + Queue _copyFinishQueue; + DataCopyThread* _copier; + bool _outstandingCopyRequest; + int _start, _end; + +public: + void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer); + DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID); + ~DataLayer(); + NVMatrix& getActs(int deviceID); +// NVMatrix& getActs(int deviceID, bool other); + NVMatrix& getActs(int deviceID, bool other, int numCases); + bool isGradProducer(); + void toggleBuffer(int passIdx); + void copyData(CPUData& data, bool other, int passIdx); + bool postInit(); + ConvNet& getConvNet(); + int getNumInputReplicas(); + cudaStream_t getCopyStream(int deviceID); + Queue& getCopyFinishQueue() { + return _copyFinishQueue; + } + void waitForCopyFinish(); + int getDataIdx() const { + return _dataIdx; + } + int getStart() const { + return _start; + } + int getEnd() const { + return _end; + } +}; + + +class DataCopyThread : public Thread { +protected: + DataLayer* _parent; + Queue _queue; + HostNVMatrix _hostMemFwd; + Timer _requestTimer; + int _sleepUsec; + virtual void* run(); + +public: + DataCopyThread(DataLayer& parent, intv& cpus); + Queue& getQueue(); + void stop(); +}; + + +class LocalLayer : public WeightLayer { +protected: + intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups; + intv* _imgPixels, *_filterPixels, *_filterChannels; + int _modulesX, _modules, _numFilters; + +public: + LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); + virtual ~LocalLayer(); +}; + +class ConvLayer : public LocalLayer { +protected: + int _sumWidth; + bool _sharedBiases; + floatv* _weightContrastNormMin, *_weightContrastNormMax; + NVMatrix _weightGradTmp; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void truncBwdActs(); + void _constrainWeights(); + +public: 
+ ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + virtual ~ConvLayer(); +}; + +class LocalUnsharedLayer : public LocalLayer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void bpropBiases(NVMatrix& v, PASS_TYPE passType); + void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); + void _constrainWeights(); +public: + LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class PoolLayer : public Layer, public TwoDLayerInterface { +protected: + int _sizeX, _start, _stride, _outputsX; + std::string _pool; +public: + PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + + static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class AvgPoolLayer : public PoolLayer { +protected: + bool _sum; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class MaxPoolLayer : public PoolLayer { +protected: + bool _abs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs); +}; + +class CrossMapPoolLayer : public Layer, public TwoDLayerInterface { +protected: + int _size, _start, _stride, _outputs; + std::string _pool; +public: + CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + + static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CrossMapMaxPoolLayer : public CrossMapPoolLayer { +protected: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RandomScaleLayer : public Layer, public TwoDLayerInterface { +protected: + int _tgtSize, _minScaledSize; + float _maxScale; // should be >= 1 + NVMatrix _rescaledActs; + std::vector _scaleProbs; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CropLayer : public Layer, public TwoDLayerInterface { +protected: + int _tgtSize, _startX, _startY; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class NailbedLayer : public Layer, public TwoDLayerInterface { +protected: + int _start, _stride, _outputsX; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + 
NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class GaussianBlurLayer : public Layer, public TwoDLayerInterface { +protected: + Matrix* _hFilter; + NVMatrix _filter; + NVMatrix _actGradsTmp; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void copyToGPU(); + + GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + ~GaussianBlurLayer(); +}; + +class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface { +protected: +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID); +}; + +class ResizeLayer : public Layer, public TwoDLayerInterface { +protected: + float _scale; + int _tgtSize; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class DropoutLayer : public Layer { +protected: + bool _enable; + float _keep; + NVMatrix _keepMask; +public: + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); + DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + class DropoutSmallerThanOperator { + private: + float _keep, _scale; + public: + DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) { + } + __device__ inline float operator()(const float x) const { + return (x < _keep) * _scale; + } + }; +}; + +class Dropout2Layer : public DropoutLayer { +protected: +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RGBToYUVLayer : public Layer { +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class RGBToLABLayer : public Layer { +protected: + bool _center; +public: + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + + RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class ResponseNormLayer : public Layer, public TwoDLayerInterface { +protected: + int _size; + float _scale, _pow; + float _minDiv; + NVMatrix _denoms; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); +public: + ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CrossMapResponseNormLayer : public ResponseNormLayer { 
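+    // Same scheme as ResponseNormLayer, except that the window of _size units
+    // runs across neighboring feature maps (channels) rather than over a
+    // spatial neighborhood.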
+protected: + bool _blocked; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class ContrastNormLayer : public ResponseNormLayer { +protected: + NVMatrix _meanDiffs; + + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); + void truncBwdActs(); +public: + ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class CostLayer : public Layer { +protected: + float _coeff; + doublev _costv; + NVMatrix _tmpbuf; // For error accumulation + int _numCases; // number of cases that the values in _costv were computed on + bool _aggregated; + void fpropCommon(PASS_TYPE passType); +public: + CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); + void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); + bool fprop(PASS_TYPE passType, int passIdx); + + int getNumCases(); + virtual doublev& getCost(); + float getCoeff(); + bool isGradProducer(); + void setSendTerminalMessages(bool send); + void resetPassIdx(); + + static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID); +}; + +/* + * Input 0: labels + * Input 1: softmax outputs + */ +class CrossEntCostLayer : public CostLayer { +protected: + NVMatrix _trueLabelLogProbs, _correctProbs; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +/* + * Input 0: labels + * Input 1: softmax outputs + */ +class LogregCostLayer : public CostLayer { +protected: + NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs; + std::map _probsAccum; // input replica idx -> nvmatrix + NVMatrix _maxProbs; + std::map _numAccumed; // input replica idx -> int + int _topk; + bool _doCompute; + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + NVMatrix& getProbsAccum(int replicaIdx); +}; + +/* + * Input 0: labels + * Input 1: logistic outputs + */ +class BinomialCrossEntropyCostLayer : public CostLayer { +protected: + bool _computeSoftmaxErrorRate; + NVMatrix _tmpProbs, _tmpVec, _correctProbs; + float _posWeight; + virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); + float getPosWeight(); + + // Only for use with non-logistic units + class BinomialCrossEntGradientOperator { + private: + float _coeff, _posWeight; + public: + BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { + } + __device__ inline float operator()(const float t, const float y) const { + return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y)); + } 
+ }; +}; + +/* + * Input 0: labels + * Input 1: logistic outputs + */ +class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer { +protected: + Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive; + NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); +public: + DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +class SumOfSquaresCostLayer : public CostLayer { +protected: + NVMatrix _tmp; + void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); + void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); +public: + SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); +}; + +#endif /* LAYER_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh new file mode 100644 index 0000000..ec61266 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LAYER_KERNELS_CUH +#define LAYER_KERNELS_CUH + +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" + +#define LOGREG_GRAD_THREADS_X 32 +#define LOGREG_GRAD_THREADS_Y 4 + +#define LOGREG_ERR_THREADS_X 128 +#define LOGREG_ERR_THREADS_Y 1 + +__device__ inline float safelog(const float x) { + return x > 0.0f ? __logf(x) : -50.0f; +} + +// The input matrix here is the squared norm. +// This replaces the squared norm with: +// 1 if it is below the threshold given by norm2 +// norm/sqrt(a) otherwise -- i.e. the desired norm (not squared) +class MaxWeightConstraintOperator { +private: + float _norm, _norm2; +public: + MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { + } + __device__ inline float operator()(const float a) const { + return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f; + } +}; + +class HardWeightConstraintOperator { +private: + float _norm, _norm2; +public: + HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { + } + __device__ inline float operator()(const float a) const { + return __fdividef(_norm, sqrtf(a)); + } +}; + +class WeightContrastNormOperator { +private: + float _min, _max, _scale; +public: + WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) { + } + __device__ inline float operator()(float a) const { + a = sqrtf(a) * _scale; + return a < _min ? __fdividef(_min, a) : a > _max ? 
__fdividef(_max, a) : 1.0f;
+    }
+};
+
+void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
+void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
+
+void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
+void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+
+
+// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
+// to avoid dividing and then multiplying by quantities that may be near zero.
+void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
+void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
+void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
+                            NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
+#endif /* LAYER_KERNELS_CUH */
+
diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
new file mode 100644
index 0000000..10a409a
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LR_CUH
+#define LR_CUH
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "util.cuh"
+#include "../../nvmatrix/include/nvmatrix.cuh"
+#include "../../util/include/matrix.h"
+
+/*
+ * The maximum learning rate is _baseRate.
+ * The minimum learning rate is _baseRate / _tgtFactor.
+ *
+ * These classes define annealing schedules that interpolate between these
+ * two extrema.
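+ *
+ * getValue(progress) maps the fraction of training completed (nominally in
+ * [0, 1]) to the scheduled value; the subclasses below interpolate between
+ * the two extrema linearly, exponentially, or in discrete exponential steps.
+ *
+ * A minimal usage sketch (schedDict stands in for the Python-side schedule
+ * definition passed down from the layer config):
+ *
+ *   ParameterSchedule& lrs = ParameterSchedule::make(schedDict);
+ *   double rate = lrs.getValue(0.5);  // scheduled value halfway through training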
+ */ +class ParameterSchedule { +protected: + double _baseRate; +public: + ParameterSchedule(double base); + virtual double getValue(double progress); + double getBaseValue() const; + virtual ~ParameterSchedule(); + + static ParameterSchedule& make(PyObject* schedDict); +}; + +class LinearParameterSchedule : public ParameterSchedule { +protected: + double _finalRate; +public: + LinearParameterSchedule(double base, double tgtFactor); + virtual double getValue(double progress); +}; + +class ExpParameterSchedule : public ParameterSchedule { +protected: + double _powBase; +public: + ExpParameterSchedule(double baseRate, double tgtFactor); + virtual double getValue(double progress); +}; + +class DiscreteExpParameterSchedule : public ParameterSchedule { +protected: + std::vector _rates; +public: + DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps); + virtual double getValue(double progress); +}; + + +#endif /* LR_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh new file mode 100644 index 0000000..9ea3f69 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh @@ -0,0 +1,61 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" + +class MemorySource; + +class MemoryView { +protected: + MemorySource* _src; + std::string _name; +public: + MemoryView(MemorySource& src, std::string& name); + ~MemoryView(); + NVMatrix& getMemory(int numCases); + NVMatrix& getMemory(); + MemorySource& getMemorySource(); + bool isParent(); + std::string& getName(); + MemoryView& clone(std::string& name); +}; + +// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU. +class MemorySource { +protected: +// int _inputIdx; + NVMatrix _memory; + int _deviceID; + int _size; + std::map > _viewRanges; + std::map _memoryViews; // input idx --> slice of _memory + std::set _truncateRequests; + Lock _lock; +public: + MemorySource(int size, int deviceID); + ~MemorySource(); + NVMatrix& getMemory(std::string& name, int numCases); + NVMatrix& getMemory(std::string& name); + MemoryView& addUser(std::string& name, std::pair range); + MemoryView& addUser(std::string& name); + std::pair getRange(std::string& name); + int getSize(); + bool truncate(std::string& name); + static MemoryView& make(int size, int deviceID, std::string& parentUser); +}; + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh new file mode 100644 index 0000000..25dd2f4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh @@ -0,0 +1,128 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MESSAGES_CUH_ +#define MESSAGES_CUH_ + +#include +#include "layer.cuh" + +class Layer; + +enum MESSAGES { FPROP_TERMINAL, + BPROP_TERMINAL, + BPROP_READY, + FPROP_READY, + SYNC, + COPY_TO_CPU, + COPY_TO_GPU, + UPDATE_WEIGHTS, + CONSTRAIN_WEIGHTS, + RESET, + RESET_PASS_IDX, + COST_COMPUTED, + BPROP_START, + EXIT_CONVNET}; + +class Message { +protected: + MESSAGES _messageType; +public: + MESSAGES getType() { + return _messageType; + } + virtual Message* clone() { + return new Message(_messageType); + } + Message(MESSAGES messageType) : _messageType(messageType) { + } + virtual ~Message() { + } +}; + +class PropMessage : public Message { +protected: + Layer *_toLayer; + PASS_TYPE _passType; + int _passIdx; +public: + + Layer& getToLayer() { + return *_toLayer; + } + + PASS_TYPE getPassType() { + return _passType; + } + + int getPassIdx() { + return _passIdx; + } + + virtual PropMessage* clone() { + return new PropMessage(*_toLayer, _passType, _passIdx, _messageType); + } + + PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType) + : _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) { + } +}; + +class FpropMessage : public PropMessage { +public: + FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) + : PropMessage(toLayer, passType, passIdx, FPROP_READY) { + } + virtual FpropMessage* clone() { + return new FpropMessage(*_toLayer, _passType, _passIdx); + } +}; + +class BpropMessage : public PropMessage { +public: + BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) + : PropMessage(toLayer, passType, passIdx, BPROP_READY) { + } + virtual BpropMessage* clone() { + return new BpropMessage(*_toLayer, _passType, _passIdx); + } +}; + +class BpropStartMessage : public Message { +protected: + PASS_TYPE _passType; + int _passIdx; +public: + PASS_TYPE getPassType() { + return _passType; + } + + int getPassIdx() { + return _passIdx; + } + + virtual BpropStartMessage* clone() { + return new BpropStartMessage(_passType, _passIdx); + } + + BpropStartMessage(PASS_TYPE passType, int passIdx) + : _passType(passType), Message(BPROP_START), _passIdx(passIdx) { + } +}; + + + +#endif /* MESSAGES_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh new file mode 100644 index 0000000..d573901 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh @@ -0,0 +1,541 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NEURONS_CUH +#define NEURONS_CUH + +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include + +template +class AddGradientBinaryOperator { + GradientOp _op; +public: + AddGradientBinaryOperator(GradientOp op) : _op(op) { + } + __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const { + return _op(unitActGrad, unitAct) + target; + } +}; + +template +class AddGradientOperator { + GradientOp _op; +public: + AddGradientOperator(GradientOp op) : _op(op) { + } + __device__ inline float operator()(const float unitActGrad, const float target) const { + return target + _op(unitActGrad); + } +}; + +/* ======================= + * Neuron + * ----------------------- + * + * f(x) = x + * ======================= + */ +class Neuron { +protected: + bool _activated; + // Inputs and outputs potentially point to the same matrix, depending on the neuron + NVMatrix* _inputs, *_outputs; + virtual void _activate() { + if (_inputs != _outputs) { + _inputs->copy(*_outputs); + } + } + virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + if (&target != &actsGrad) { + actsGrad.copy(target); + } + } + virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + if (&target != &actsGrad) { + target.add(actsGrad); + } + } +public: + Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) { + } + virtual void activate(NVMatrix& inputs, NVMatrix& outputs) { + _activated = true; + _inputs = &inputs; + _outputs = &outputs; + _activate(); + } + + virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) { + assert(_activated); + if (!add) { + target.resize(actsGrad); + _computeInputGrad(actsGrad, target); + } else { + _addInputGrad(actsGrad, target); + } + } + + static Neuron& makeNeuron(PyObject* neuronDict); +}; + +/* ======================= + * LogisticNeuron + * ----------------------- + * + * f(x) = 1 / (1 + e^-x) + * ======================= + */ +class LogisticNeuron : public Neuron { +protected: + void _activate() { + _inputs->apply(NVMatrixOps::Logistic(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(LogisticGradientOperator()), *_outputs, target, target); + } +public: + class LogisticGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * unitAct * (1.0f - unitAct); + } + }; + + LogisticNeuron() : Neuron() { + } +}; + +/* ======================= + * LogNeuron + * ----------------------- + * + * f(x) = log(eps + x) + * ======================= + */ +class LogNeuron : public Neuron { +protected: + float _eps; + void _activate() { + _inputs->apply(LogOperator(_eps), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(LogGradientOperator(_eps)), *_inputs, target, target); + } +public: + class LogGradientOperator { + protected: + float _eps; + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return __fdividef(unitActGrad, _eps + unitInput); + } + LogGradientOperator(float eps) : _eps(eps) { + + } + }; + + class LogOperator { + 
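+        // Applied in the forward pass: computes f(x) = __logf(_eps + x).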
protected: + float _eps; + public: + __device__ inline float operator()(float x) const { + return __logf(_eps + x); + } + LogOperator(float eps) : _eps(eps) { + + } + }; + + LogNeuron(float eps) : _eps(eps), Neuron() { + } +}; + +/* ======================= + * ReluNeuron + * ----------------------- + * + * f(x) = max(0, x) + * ======================= + */ +class ReluNeuron : public Neuron { +protected: + virtual void _activate() { + _inputs->apply(ReluOperator(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(ReluGradientOperator()), *_outputs, target, target); + } +public: + class ReluOperator { + public: + __device__ inline float operator()(float x) const { + return x < 0.0f ? 0.0f : x; + } + }; + + class ReluGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * (unitAct > 0.0f); + } + }; + + ReluNeuron() : Neuron() { + } +}; + + +/* ======================= + * BoundedReluNeuron + * ----------------------- + * + * f(x) = min(a, max(0, x)) + * ======================= + */ +class BoundedReluNeuron : public Neuron { +protected: + float _a; + + void _activate() { + _inputs->apply(BoundedReluOperator(_a), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(BoundedReluGradientOperator(_a)), *_outputs, target, target); + } +public: + class BoundedReluOperator { + private: + float _a; + public: + BoundedReluOperator(float a) : _a(a) { + } + __device__ inline float operator()(float x) const { + return x < 0.0f ? 0.0f : x > _a ? _a : x; + } + }; + + class BoundedReluGradientOperator { + private: + float _a; + public: + BoundedReluGradientOperator(float a) : _a(a) { + } + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return unitActGrad * (unitAct > 0.0f) * (unitAct < _a); + } + }; + + BoundedReluNeuron(float a) : Neuron(), _a(a) { + } +}; + +/* ======================= + * AbsNeuron + * ----------------------- + * + * f(x) = abs(x) + * ======================= + */ +class AbsNeuron : public Neuron { +protected: + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(NVMatrixOps::Abs(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(AbsGradientOperator()), *_inputs, target, target); + } +public: + class AbsGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return unitActGrad * (unitInput > 0.0f ? 
1.0f : -1.0f); + } + }; + + AbsNeuron() : Neuron() { + } +}; + +/* ======================= + * TanhNeuron + * ----------------------- + * + * f(x) = a*tanh(b*x) + * ======================= + */ +class TanhNeuron : public Neuron { +protected: + float _a, _b; + + void _activate() { + _inputs->apply(TanhOperator(_a, _b), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(TanhGradientOperator(_a, _b)), *_outputs, target, target); + } +public: + class TanhOperator { + private: + float _a, _n2b; + public: + TanhOperator(float a, float b) : _a(a), _n2b(-2*b) { + } + virtual __device__ inline float operator()(float x) const { + return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f); + } + }; + + class TanhGradientOperator { + private: + float _b, _a; + public: + TanhGradientOperator(float a, float b) : _b(b), _a(a) { + } + __device__ inline float operator()(float unitActGrad, float unitAct) const { +// const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f; +// return unitActGrad * _n4ab * (t * (t - 1.0f)); + return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a)); + } + }; + + TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) { + } +}; + +/* ======================= + * DoubleReluNeuron + * ----------------------- + * + * f(x) = x - a*tanh(x/a) + * ======================= + */ +class DoubleReluNeuron : public Neuron { +protected: + float _a; + + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(DoubleReluOperator(_a), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(DoubleReluGradientOperator(_a)), *_inputs, target, target); + } +public: + class DoubleReluOperator { + private: + float _a, _n2a; + public: + DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) { + } + virtual __device__ inline float operator()(float x) const { + return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f); + } + }; + + class DoubleReluGradientOperator { + private: + float _n2a; + public: + DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) { + } + __device__ inline float operator()(float unitActGrad, float unitInput) const { + const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f; + return unitActGrad * (tanh*tanh); + } + }; + + DoubleReluNeuron(float a) : Neuron(), _a(a) { + } +}; + +/* ======================= + * SoftReluNeuron + * ----------------------- + * + * f(x) = log(1 + e^x) + * ======================= + */ +class SoftReluNeuron : public Neuron { +protected: + void _activate() { +// assert(_inputs != _outputs); + _inputs->apply(SoftReluOperator(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SoftReluGradientOperator()), *_outputs, target, target); + } +public: + class SoftReluOperator { + public: + __device__ inline float operator()(float x) const { + // This piece-wise implementation has better numerical stability than + // simply computing log(1 + e^x). + return x > 4.0f ? 
x : __logf(1.0f + __expf(x)); + } + }; + + class SoftReluGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitOutput) const { + if (unitOutput > 4.0f) { + return unitActGrad; + } + const float f = __expf(-unitOutput); + return unitActGrad * (1.0f - f); + } + }; + + SoftReluNeuron() : Neuron() { + } +}; + +/* ======================= + * SquareNeuron + * ----------------------- + * + * f(x) = x^2 + * ======================= + */ +class SquareNeuron : public Neuron { +protected: + void _activate() { + assert(_inputs != _outputs); + _inputs->apply(NVMatrixOps::Square(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SquareGradientOperator()), *_inputs, target, target); + } +public: + class SquareGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitInput) const { + return unitActGrad * 2.0f * unitInput; + } + }; + + SquareNeuron() : Neuron() { + } +}; + +/* ======================= + * SqrtNeuron + * ----------------------- + * + * f(x) = sqrt(x) + * ======================= + */ +class SqrtNeuron : public Neuron { +protected: + void _activate() { + _inputs->apply(NVMatrixOps::Sqrt(), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyTernary(AddGradientBinaryOperator(SqrtGradientOperator()), *_outputs, target, target); + } +public: + class SqrtGradientOperator { + public: + __device__ inline float operator()(float unitActGrad, float unitAct) const { + return __fdividef(unitActGrad, 2.0f * unitAct); + } + }; + + SqrtNeuron() : Neuron() { + } +}; + +/* ======================= + * LinearNeuron + * ----------------------- + * + * f(x) = a*x + b + * ======================= + */ +class LinearNeuron : public Neuron { +protected: + float _a, _b; + void _activate() { + _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs); + } + + void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.scale(_a, target); + } + + void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { + actsGrad.applyBinary(AddGradientOperator(NVMatrixOps::MultByScalar(_a)), target, target); + } +public: + LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) { + } +}; +#endif /* NEURONS_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh new file mode 100644 index 0000000..9c43c9d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh @@ -0,0 +1,175 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PIPEDISPENSER_CUH_ +#define PIPEDISPENSER_CUH_ + +#include +#include +#include +#include +#include "../../util/include/thread.h" +#include "util.cuh" + +/* + * PipeDispenser interface + */ +class PipeDispenser { +protected: + int _numPipes; + seti _pipes; + pthread_mutex_t *_mutex; + + void lock() { + pthread_mutex_lock(_mutex); + } + + void unlock() { + pthread_mutex_unlock(_mutex); + } + + virtual void init() { + _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + pthread_mutex_init(_mutex, NULL); + } +public: + PipeDispenser(const seti& pipes) { + _pipes.insert(pipes.begin(), pipes.end()); + init(); + } + + PipeDispenser(int numPipes) { + for (int i = 0; i < numPipes; ++i) { + _pipes.insert(i); + } + init(); + } + + virtual ~PipeDispenser() { + pthread_mutex_destroy(_mutex); + free(_mutex); + } + + virtual int getPipe(const seti& interested) = 0; + + int getPipe(int interested) { + seti tmp; + tmp.insert(interested); + return getPipe(tmp); + } + + virtual void freePipe(int pipe) = 0; +}; + +/* + * This one blocks until there is a free pipe to return. + */ +class PipeDispenserBlocking : public PipeDispenser { +protected: + pthread_cond_t *_cv; + + void wait() { + pthread_cond_wait(_cv, _mutex); + } + + void broadcast() { + pthread_cond_broadcast(_cv); + } + + int getAvailablePipes(const seti& interested, intv& available) { + available.clear(); + std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available)); + return available.size(); + } + + virtual void init() { + PipeDispenser::init(); + _cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t))); + pthread_cond_init(_cv, NULL); + } +public: + PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) { + init(); + } + + PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) { + init(); + } + + ~PipeDispenserBlocking() { + pthread_cond_destroy(_cv); + free(_cv); + } + + int getPipe(const seti& interested) { + lock(); + intv avail; + while (getAvailablePipes(interested, avail) == 0) { + wait(); + } + int pipe = avail[0]; + _pipes.erase(pipe); + unlock(); + return pipe; + } + + void freePipe(int pipe) { + lock(); + _pipes.insert(pipe); + broadcast(); + unlock(); + } +}; + +/* + * This one returns the least-occupied pipe. + */ +class PipeDispenserNonBlocking : public PipeDispenser { +protected: + std::map _pipeUsers; + +public: + PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) { + for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) { + _pipeUsers[*it] = 0; + } + } + + int getPipe(const seti& interested) { + lock(); + int pipe = -1, users = 1 << 30; + for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) { + if (interested.count(*it) > 0 && _pipeUsers[*it] < users) { + pipe = *it; + users = _pipeUsers[*it]; + } + } + if (pipe >= 0) { + _pipeUsers[pipe]++; + } + unlock(); + return pipe; + } + + void freePipe(int pipe) { + lock(); + _pipeUsers[pipe]--; + unlock(); + } +}; + + +#endif /* PIPEDISPENSER_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh new file mode 100644 index 0000000..911c4cd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh @@ -0,0 +1,35 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PYCONVNET3_CUH +#define PYCONVNET3_CUH + +#define _QUOTEME(x) #x +#define QUOTEME(x) _QUOTEME(x) + +extern "C" void init_ConvNet(); + +PyObject* initModel(PyObject *self, PyObject *args); +PyObject* startBatch(PyObject *self, PyObject *args); +PyObject* finishBatch(PyObject *self, PyObject *args); +PyObject* checkGradients(PyObject *self, PyObject *args); +PyObject* syncWithHost(PyObject *self, PyObject *args); +PyObject* startMultiviewTest(PyObject *self, PyObject *args); +PyObject* startFeatureWriter(PyObject *self, PyObject *args); +PyObject* startDataGrad(PyObject *self, PyObject *args); +PyObject* decodeJpeg(PyObject *self, PyObject *args); + +#endif diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh new file mode 100644 index 0000000..8bafce5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh @@ -0,0 +1,185 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef REDUCEPIPELINE_CUH_H_ +#define REDUCEPIPELINE_CUH_H_ + +#include "../../util/include/thread.h" +#include "../../util/include/queue.h" +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k +#define REDUCE_MAX_CHUNKS 16 +#define REDUCE_MIN_CHUNKS 2 + +enum REDUCE_MESSAGE_TYPE { + REDUCE_CHUNK, + REDUCE_START, + EXIT +}; + +class ReducePeer; +class ReducerSource; +class IReduceSegment; +class IEightGPUReducer; + +class ReduceMessage { +protected: + REDUCE_MESSAGE_TYPE _msgType; + float _scaleIntermediates, _scaleTarget; + std::map* _mats; +public: + ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map& mats) + : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) { + } + ReduceMessage(REDUCE_MESSAGE_TYPE msgType) + : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) { + } + inline REDUCE_MESSAGE_TYPE getType() const { + return _msgType; + } + inline float getScaleIntermediates() const { + return _scaleIntermediates; + } + inline float getScaleTarget() const { + return _scaleTarget; + } + inline NVMatrix& getMatrix(int deviceID) const { + return *_mats->at(deviceID); + } + inline std::map& getMatrices() const { + return *_mats; + } +}; + +class ReduceChunkMessage : public ReduceMessage { +protected: + int _chunkIdx; + int _chunkSize; + int _numChunks; + + IReduceSegment* _src; +public: + ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map& mats) + : _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), + ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) { + } + + inline int getChunkIdx() const { + return _chunkIdx; + } + + inline int getChunkSize() const { + return _chunkSize; + } + + inline int getNumChunks() const { + return _numChunks; + } + + inline IReduceSegment& getSource() const { + return *_src; + } +}; + +class ReduceStartMessage : public ReduceMessage { +public: + ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map& mats) + : ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) { + } +}; + +class IReduceSegment : public Thread { +protected: + int _deviceID; + std::vector _prev; + ReducePeer* _next; + Queue _queue; + Queue* _finishQueue; + + NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx); + void* run(); + virtual bool processMessage(ReduceMessage& msg) = 0; + +public: + IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue* finishQueue); + virtual ~IReduceSegment(); + inline virtual NVMatrix& getMatrix(ReduceMessage& msg); + Queue& getQueue(); + int getDeviceID() const; + void addPrev(IReduceSegment& c); + void addNext(ReducePeer& c); + bool isTerminal() const; +}; + +class ReducerSource : public IReduceSegment { +protected: + bool processMessage(ReduceMessage& msg); +public: + ReducerSource(IEightGPUReducer& parent, int deviceID); +}; + +class ReducePeer : public IReduceSegment { +protected: + std::map _streams; // device id -> stream + std::map _numInputsReceived; // chunk idx -> num inputs + int _numInputsFinished; + HostNVMatrix _mat; + bool _add; + bool processMessage(ReduceMessage& msg); + inline cudaStream_t getStream(int deviceID); + inline NVMatrix& getMatrix(ReduceMessage& msg); + void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt); +public: + 
ReducePeer(IEightGPUReducer& parent, int deviceID, Queue* finishQueue); + ReducePeer(IEightGPUReducer& parent); + ~ReducePeer(); +}; + +class IEightGPUReducer { +protected: + std::vector _sources; + std::vector _peers; + Queue _finishQueue; + int _tgtDeviceID; + virtual void makeConnections(std::vector& same, std::vector&other) = 0; +public: + IEightGPUReducer(int tgtDeviceID); + virtual ~IEightGPUReducer(); + IEightGPUReducer& construct(); + void reduce(std::map& mats, float scaleIntermediates, float scaleTarget); + void reduce(std::map& mats, float scaleIntermediates); + void reduce(std::map& mats); + int getTgtDeviceID() const; +}; + +class EightGPUReducer1 : public IEightGPUReducer { +protected: + void makeConnections(std::vector& same, std::vector&other); +public: + EightGPUReducer1(int tgtDeviceID); +}; + +class EightGPUReducer2 : public IEightGPUReducer { +protected: + void makeConnections(std::vector& same, std::vector&other); +public: + EightGPUReducer2(int tgtDeviceID); +}; + +#endif /* REDUCEPIPELINE_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh new file mode 100644 index 0000000..7aa27f9 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh @@ -0,0 +1,53 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STREAMBROADCAST_CUH_ +#define STREAMBROADCAST_CUH_ + +#include +#include "../../util/include/queue.h" +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "util.cuh" + +class Layer; + +//#define NUM_STREAM_COPY_PARTS 4 +// This is in 4-byte words, not bytes +#define SB_MIN_CHUNK_SIZE (1<<17) +#define SB_MAX_CHUNKS 16 + +class StreamBroadcast { +protected: + std::map _streams; + std::set _ownedStreams; + HostNVMatrix _hostMem; + void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice); + void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput); + void init(std::map& streams); + void init(std::map& mats); +public: + StreamBroadcast(std::map& streams); + StreamBroadcast(); + virtual ~StreamBroadcast(); + + void transfer(std::map& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput); + void transfer(std::map& mats, int srcDevice, float scaleTarget, float scaleOutput); + void transfer(std::map& mats, int srcDevice); + void sync(int deviceID); + cudaStream_t getStream(int deviceID); +}; + +#endif /* STREAMBROADCAST_CUH_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh new file mode 100644 index 0000000..3f479f2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh @@ -0,0 +1,52 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TIMER_CC_H_ +#define TIMER_CC_H_ + +#include + +class Timer { +protected: + StopWatchInterface* _timer; + bool _started; + +public: + Timer() : _started(false) { + sdkCreateTimer(&_timer); + } + + ~Timer() { + sdkDeleteTimer(&_timer); + } + inline void start () { + _started = true; + sdkResetTimer(&_timer); + sdkStartTimer(&_timer); + } + + inline double stop() { + sdkStopTimer(&_timer); + _started = false; + return sdkGetTimerValue(&_timer); + } + + inline bool isStarted() const { + return _started; + } +}; + +#endif /* TIMER_CC_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh new file mode 100644 index 0000000..ef31e44 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh @@ -0,0 +1,130 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_H +#define UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../util/include/matrix.h" + + +#define PASS_TYPE uint +#define PASS_TRAIN 0x1 +#define PASS_TEST 0x2 +#define PASS_GC 0x4 +#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8) +#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10) +#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20) +#define PASS_FEATURE_GEN 0x40 + +#define HAS_FLAG(f, x) (((x) & (f)) == (f)) +#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x) +#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x) +#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x) +#define IS_TEST(x) HAS_FLAG(PASS_TEST, x) +#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x) + +// For gradient checking +#define GC_SUPPRESS_PASSES false +#define GC_REL_ERR_THRESH 0.02 + +#ifdef DO_PRINT +#define PRINT(x, args...) printf(x, ## args); +#else +#define PRINT(x, args...) ; +#endif + +/* + * Generates a random floating point number in the range 0-1. 
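+ * Note: the macro below uses rand(), so results are only reproducible if the
+ * caller seeds the generator (e.g. with srand()), and it is not thread-safe.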
+ */ +#define randf ((float)rand() / RAND_MAX) + +//typedef std::vector MatrixV; +//typedef std::vector NVMatrixV; +typedef std::map*> CostMap; +typedef std::map CostCoeffMap; +typedef std::vector doublev; +typedef std::vector floatv; +typedef std::vector intv; +typedef std::vector stringv; +typedef std::set seti; +typedef std::vector PyObjectV; + +stringv* getStringV(PyObject* pyList); +floatv* getFloatV(PyObject* pyList); +intv* getIntV(PyObject* pyList); +MatrixV* getMatrixV(PyObject* pyList); +MatrixV* getMatrixV(PyObject* pyList, int len); +int* getIntA(PyObject* pyList); + +int pyDictGetInt(PyObject* dict, const char* key); +intv* pyDictGetIntV(PyObject* dict, const char* key); +std::string pyDictGetString(PyObject* dict, const char* key); +float pyDictGetFloat(PyObject* dict, const char* key); +floatv* pyDictGetFloatV(PyObject* dict, const char* key); +Matrix* pyDictGetMatrix(PyObject* dict, const char* key); +MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key); +int* pyDictGetIntA(PyObject* dict, const char* key); +stringv* pyDictGetStringV(PyObject* dict, const char* key); +bool pyDictHasKey(PyObject* dict, const char* key); +PyObjectV* pyDictGetValues(PyObject* dict); + +template std::string tostr(T n); +template void shuffleVector(std::vector& v, int start, int end); +template void deleteElements(std::vector& v); +template void deleteElements(std::vector& v, bool deleteContainer); + +template +int indexOf(std::vector& v, T e) { + int i = 0; +// typename vector::iterator it2 = v.begin(); + for (typename std::vector::const_iterator it = v.begin(); it != v.end(); ++it) { + if (*it == e) { + return i; + } + ++i; + } + return -1; +} + +std::vector& getDeviceCPUs(int deviceID); + +template std::set getKeys(std::map& m) { + std::set s; + for (typename std::map::const_iterator it = m.begin(); it != m.end(); ++it) { + s.insert(it->first); + } + return s; +} + +struct LayerIDComparator { + bool operator()(PyObject* i, PyObject* j) { + return pyDictGetInt(i, "id") < pyDictGetInt(j, "id"); + } +}; + +#endif /* UTIL_H */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh new file mode 100644 index 0000000..dd1e522 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh @@ -0,0 +1,159 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef WEIGHTS_CUH +#define WEIGHTS_CUH + +#include +#include +#include +#include +#include +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../util/include/matrix.h" +#include "util.cuh" +#include "lr.cuh" +#include "layer.cuh" +#include "copypipeline.cuh" +#include "reducepipeline.cuh" +#include "streambroadcast.cuh" + +class Layer; +class Weights; +class StreamBroadcast; + +class IWeightReducer { +protected: + int _tgtReplicaID; + std::map _replicas; + + int getDeviceID(); +public: + IWeightReducer(std::map& replicas, int srcReplicaID); + virtual ~IWeightReducer(); + static IWeightReducer& make(std::map& replicas, int srcReplicaID); + virtual void reduce(std::map gradShards, float gradScale, bool toInc) = 0; +}; + +class SequentialWeightReducer : public IWeightReducer { +protected: + StreamBroadcast* _sb; +public: + SequentialWeightReducer(std::map& replicas, int srcReplicaID); + ~SequentialWeightReducer(); + void reduce(std::map gradShards, float gradScale, bool toInc); +}; + +class ParallelWeightReducer : public IWeightReducer { +protected: + IEightGPUReducer* _reducer; +public: + ParallelWeightReducer(std::map& replicas, int srcReplicaID); + ~ParallelWeightReducer(); + void reduce(std::map gradShards, float gradScale, bool toInc); +}; + +class Weights { +protected: + Matrix* _hWeights, *_hWeightsInc; + NVMatrix* _weights, *_weightsInc, *_weightsGrad; + + ParameterSchedule* _lrs; + + float _wc, _mom, _wball; + bool _onGPU, _useGrad, _cleanup; + int _numUpdates; + + // Note: every layer is its own sibling too + std::map _replicas; + + // Non-NULL if these weights are really shared from some other layer + Weights* _srcWeights; + Layer* _parent; + int _shardSize; + IWeightReducer* _reducer; + ISafeBroadcastNetwork* _broadcaster; + + void aggregateReplicaGradients(float progress); + + // TODO: assert that these retrun contiguous views + template T& getShard(T& mat, int replicaID); + template T& getShard(T& mat); + void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup); + +public: + NVMatrix& operator*() const; + + Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent); + Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, + float wc, float wball, float mom, bool useGrad); + + virtual ~Weights(); + + virtual NVMatrix& getW() const; + virtual NVMatrix& getInc() const; + virtual NVMatrix& getGrad() const; + virtual Matrix& getCPUW() const; + virtual Matrix& getCPUWInc() const; + virtual ParameterSchedule& getLearningRateSchedule() const; + virtual int getNumRows() const; + virtual int getNumCols() const; + virtual void copyToCPU(); + + // This function is assumed to be called in the order in which the layers + // were defined + virtual void copyToGPU(); + + virtual void update(float progress); + virtual void addReplica(Weights& sibling); + int incNumUpdates(); + + // Returns the number of times a gradient has been computed for this + // weight matrix during the current pass (interval between two calls of update()) + // through the net. This number will only be greater than 1 if this weight matrix + // is *shared* by multiple layers in the net. 
+ int getNumUpdates() const; + float getEps(float progress) const; + float getMom() const; + float getWC() const; + float getWBall() const; + bool isUseGrad() const; + bool isOwner() const; + int getReplicaID(); + int getDeviceID(); + Layer& getParent(); + std::map& getReplicas(); + ISafeBroadcastNetwork& getBroadcaster(); + IWeightReducer& getReducer(); +}; + +class WeightList { +private: + std::vector _weightList; +public: + Weights& operator[](const int idx) const; + ~WeightList(); + WeightList(); + Weights& at(const int i) const; + void addWeights(Weights& w); + void addReplica(WeightList& sibling); + void update(float progress); + void copyToCPU(); + void copyToGPU(); + int getSize() const; +}; + +#endif /* WEIGHTS_CUH */ diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh new file mode 100644 index 0000000..233e383 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh @@ -0,0 +1,123 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef WORKER_CUH +#define WORKER_CUH + +#include "convnet.cuh" +#include "cost.cuh" +#include "data.cuh" + +class ConvNet; +class Cost; + +class WorkResult { +public: + enum RESULTS {BATCH_DONE, SYNC_DONE}; +protected: + WorkResult::RESULTS _resultType; + Cost* _results; +public: + WorkResult(WorkResult::RESULTS resultType, Cost& results); + WorkResult(WorkResult::RESULTS resultType); + virtual ~WorkResult(); + Cost& getResults() const; + WorkResult::RESULTS getResultType() const; +}; + +class Worker { +protected: + ConvNet* _convNet; +public: + Worker(ConvNet& convNet); + virtual ~Worker(); + virtual bool run() = 0; +}; + +class DataWorker : public Worker { +protected: + CPUData* _data; + DataProvider* _dp; +public: + DataWorker(ConvNet& convNet, CPUData& data); + virtual ~DataWorker(); + bool run(); + virtual void _run() = 0; +}; + +class TrainingWorker : public DataWorker { +protected: + bool _test; + double _progress; +public: + TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test); + void _run(); +}; + +class SyncWorker : public Worker { +public: + SyncWorker(ConvNet& convNet); + bool run(); +}; + +class ExitWorker : public Worker { +public: + ExitWorker(ConvNet& convNet); + bool run(); +}; + +class GradCheckWorker : public DataWorker { +public: + GradCheckWorker(ConvNet& convNet, CPUData& data); + void _run(); +}; + +class MultiviewTestWorker : public DataWorker { +protected: + int _numViews; + Matrix* _cpuProbs; + std::string _logregName; + CPUData& getMinibatch(int v, int i); +public: + MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName); + MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews); + ~MultiviewTestWorker(); + void _run(); +}; + +class FeatureWorker : public DataWorker { +protected: + MatrixV *_ftrs; + stringv *_layerNames; + bool _deleteFeatures; +public: + 
FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true); + ~FeatureWorker(); + void _run(); +}; + +class DataGradWorker : public DataWorker { +protected: + Matrix* _dataGrads; + int _dataLayerIdx, _softmaxLayerIdx; +public: + DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx); + ~DataGradWorker(); + void _run(); +}; + +#endif/* WORKER_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu new file mode 100644 index 0000000..0493d40 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../include/actbroadcaster.cuh" + +using namespace std; + +/* + * ===================== + * BroadcastMessage + * ===================== + */ +BroadcastMessage::BroadcastMessage(map mats, int srcDevice, int userIdx, Queue& finishQueue) + : _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) { +} + +BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type) + : _type(type), _finishQueue(NULL) { +} + +int BroadcastMessage::getSrcDevice() { + return _srcDevice; +} + +map& BroadcastMessage::getMatrices() { + return _mats; +} + +int BroadcastMessage::getUserIdx() { + return _userIdx; +} + +Queue& BroadcastMessage::getFinishQueue() { + return *_finishQueue; +} + +BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() { + return _type; +} + +/* + * ===================== + * ExitBroadcastMessage + * ===================== + */ +ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) { +} + +/* + * ===================== + * ActBroadcaster + * ===================== + */ +ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) { +} + +ActBroadcaster::~ActBroadcaster() { + for (map::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) { + delete it->second; + } +} + +Queue& ActBroadcaster::getMessageQueue() { + return _messageQueue; +} + +void* ActBroadcaster::run() { + int nextUserIdx = 0; + bool exit = false; + while (!exit) { + BroadcastMessage& msg = *_messageQueue.dequeue(); + if (msg.getMessageType() == BroadcastMessage::EXIT) { + exit = true; + delete &msg; + } else { + if (msg.getUserIdx() == nextUserIdx) { + if (_broadcasters.count(msg.getSrcDevice()) == 0) { + _broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice()); + } + _broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices()); + msg.getFinishQueue().enqueue(0); + delete &msg; + nextUserIdx = (nextUserIdx + 1) % _numUsers; + } else { + _messageQueue.enqueue(&msg); + } + } + } + return NULL; +} + +void ActBroadcaster::stop() { + getMessageQueue().enqueue(new ExitBroadcastMessage()); + 
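+    // Block until the broadcaster thread has dequeued the exit message and terminated.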
join(); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu new file mode 100644 index 0000000..bb4c70c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu @@ -0,0 +1,782 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "../../nvmatrix/include/nvmatrix.cuh" +#include "../../nvmatrix/include/nvmatrix_operators.cuh" +#include "../../util/include/matrix.h" +#include "../include/convnet.cuh" +#include "../include/util.cuh" + +using namespace std; + +/* + * ======================= + * ConvNet + * ======================= + */ +ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, + int minibatchSize, bool conserveMem) : Thread(true) { + _deviceIDs = deviceIDs; + _data = NULL; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; + _trainingProgress = 0; + _totalPassesDone = 0; + _conserveMem = conserveMem; + _sync = new ThreadSynchronizer(deviceIDs.size() + 1); + PyObjectV* layerList = pyDictGetValues(layerParams); + std::sort(layerList->begin(), layerList->end(), LayerIDComparator()); + + + _dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now + + initDataLayers(layerList); + initGPUThreads(layerList); + connectReplicas(); // Connect replicas to one another + connectChildren(layerParams); // Connect forward/backward links in graph + _numFwdTerminal = 0; + // Execute post-initialization stuff + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + for (int r = 0; r < it->second.size(); r++) { + _numFwdTerminal += it->second[r]->getNext().size() == 0; + if (it->second[r]->getNext().size() == 0) { + printf("Fwd terminal: %s\n", it->second[r]->getName().c_str()); + } + it->second[r]->postInit(); + } + } + + // Find and count the terminal nodes in the backward pass + for (int p = 0; p < getNumPasses(); p++) { + set visited; + _numBwdTerminal[p] = 0; + for (int t = 0; t < _convNetThreads.size(); t++) { + vector& cl = _convNetThreads[t]->getCostLayers(); + for (int c = 0; c < cl.size(); c++) { + findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p); + } + } + } + + _dp = new DataProvider(minibatchSize); +// Py_DECREF(layerList); + delete layerList; +} + +ConvNet::~ConvNet() { + for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { + (*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET)); + (*it)->join(); + delete *it; + } + for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) { + delete *it; + } + for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) { + DEVICE_MEMORY_MANAGER::destroyInstance(*it); + } + HOST_MEMORY_MANAGER::destroyInstance(); + delete _sync; + delete _dataCopyPD; + delete _dp; +} + +void ConvNet::stop() { + getWorkerQueue().enqueue(new 
ExitWorker(*this)); + join(); +} + +PipeDispenser& ConvNet::getDataCopyPD() { + return *_dataCopyPD; +} + +void ConvNet::initDataLayers(PyObjectV* layerList) { + for (int i = 0; i < layerList->size(); i++) { + PyObject* paramsDict = layerList->at(i); + std::string layerType = pyDictGetString(paramsDict, "type"); + + if (layerType == "data") { + int numReplicas = pyDictGetInt(paramsDict, "numReplicas"); + for (int r = 0; r < numReplicas; ++r) { + DataLayer* dataLayer = new DataLayer(this, paramsDict, r); + _dataLayers.push_back(dataLayer); + _layerMap[dataLayer->getName()][r] = dataLayer; + } + } + } +} + +void ConvNet::initGPUThreads(PyObjectV* layerList) { + // Initialize GPU worker threads + for (int i = 0; i < _deviceIDs.size(); ++i) { + ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this); + _convNetThreads.push_back(cng); + for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) { + const std::string& name = it->first; + Layer* layer = it->second; + _layerMap[name][layer->getReplicaID()] = layer; + } + } +} + +void ConvNet::connectReplicas() { + _numReplicasMax = 0; + _numReplicasMin = 1 << 16; + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + _numReplicasMax = max(_numReplicasMax, int(it->second.size())); + _numReplicasMin = min(_numReplicasMin, int(it->second.size())); + for (map::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) { + Layer& l1 = *it2->second; + for (map::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) { + Layer& l2 = *it3->second; + l1.addReplica(l2); + } + } + } +} + +void ConvNet::connectChildren(PyObject* layerParams) { + for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str()); + PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs"); + if (inputList != NULL) { + // Iterate over "replicas" of this layer + int numReplicas = _layerMap[it->first].size(); + for (int i = 0; i < PyList_GET_SIZE(inputList); i++) { + std::string inputName = PyString_AsString(PyList_GetItem(inputList, i)); + int numReplicasPrev = _layerMap[inputName].size(); + // How many replicas from the previous layer must this layer be connected to? + int numInputReplicas = numReplicasPrev / numReplicas; + for (int r = 0; r < numReplicas; r++) { + for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) { + it->second[r]->addPrev(*_layerMap[inputName][rp], ridx); + _layerMap[inputName][rp]->addNext(*it->second[r]); + } + } + } + } + } +} + +void ConvNet::findBwdTerminal(Layer& l, set& visited, int& terminal, int passIdx) { + if (visited.count(&l) == 0) { + visited.insert(&l); + if (l.isGradConsumer()) { + bool hasPrevConsumer = false; + if (l.getPrev().size() > 0) { + for (int i = 0; i < l.getPrev()[0].size(); i++) { + // Looking only at 0th replica is fine to see if you have + // grad consumers below you. 
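+                    // OR together the consumer flags of the replica-0 inputs; if none of
+                    // them consumes gradients, this layer is a backward-pass terminal.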
+ hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer(); + } + } + if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) { + terminal++; + l.setBwdTerminal(passIdx); + printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx); + } else if (l.isGradProducer()) { + for (int r = 0; r < l.getPrev().size(); r++) { + for (int i = 0; i < l.getPrev()[r].size(); i++) { + findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx); + } + } + } + } + } +} + +void* ConvNet::run() { + for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { + (*it)->start(); + } + // The manager thread defaults to using the GPU of the first worker. + // Put more logic here if this is inappropriate. + NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID()); + copyToGPU(); + bool exit = false; + while (!exit) { + Worker* worker = _workerQueue.dequeue(); + exit = worker->run(); + delete worker; + } + + return NULL; +} + +Queue& ConvNet::getWorkerQueue() { + return _workerQueue; +} + +Queue& ConvNet::getResultQueue() { + return _resultQueue; +} + +DataProvider& ConvNet::getDataProvider() { + return *_dp; +} + +Layer& ConvNet::getLayer(std::string& name, int replicaID) { + return *_layerMap[name][replicaID]; +} + +void ConvNet::sendMessage(MESSAGES msg, bool sync) { + sendMessage(new Message(msg), sync); +} + +void ConvNet::sendMessage(Message* msg, bool sync) { + for (int i = 0; i < _convNetThreads.size(); i++) { + _convNetThreads[i]->getMessageQueue().enqueue(msg->clone()); + } + + delete msg; + + if (sync) { + syncWithChildren(); + } +} + +void ConvNet::copyToCPU() { + sendMessage(COPY_TO_CPU, true); +} + +void ConvNet::copyToGPU() { + sendMessage(COPY_TO_GPU, false); +} + +void ConvNet::updateWeights(int passIdx) { + sendMessage(UPDATE_WEIGHTS, true); + sendMessage(CONSTRAIN_WEIGHTS, true); +} + +void ConvNet::reset(int passIdx) { + sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false); +} + +void ConvNet::reset() { + reset(0); +} + +// Fprop given data +void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) { + reset(passIdx); + // This is necessary because setData below could delete data. If there's + // an outstanding copy request, this'll cause a segfault. + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->waitForCopyFinish(); + } + + setData(data, passIdx); + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->fprop(passType, passIdx, false); + } + waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); +} + +// Fprop given minibatch idx +void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) { + reset(passIdx); + + bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx; + if (!fromBuffer) { + // This is necessary because setData below could delete data. If there's + // an outstanding copy request, this'll cause a segfault. + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->waitForCopyFinish(); + } + + setData(_dp->getMinibatch(miniIdx), passIdx); + + } else { + setDataFromBuffer(); + } + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->fprop(passType, passIdx, fromBuffer); + } + + if (passIdx == getNumPasses() - 1) { + // Do double-buffering from next minibatch from the DataProvider + setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? 
NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0); + } else { + // Do double-buffering from next microbatch within current minibatch + setBuffer(_data, miniIdx, passIdx + 1); + } + + waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); +} + +void ConvNet::setDataFromBuffer() { + if (_bufferData != _data) { + delete _data; + } + _data = _bufferData; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; +} + +void ConvNet::setData(CPUData& data, int passIdx) { + bool same = _data == _bufferData; + if (&data != _data) { + delete _data; + } + if (&data != _bufferData && !same) { + delete _bufferData; + _bufferData = NULL; + _bufferMinibatchIdx = -1; + _bufferPassIdx = -1; + } + _data = &data; + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->copyData(*_data, false, passIdx); + } +} + +void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) { + _bufferData = bufferData; + _bufferMinibatchIdx = bufferMinibatchIdx; + _bufferPassIdx = bufferPassIdx; + if (bufferData != NULL) { + for (int i = 0; i < _dataLayers.size(); i++) { + _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx); + } + } +} + +CPUData& ConvNet::getData() { + assert(_data != NULL); + return *_data; +} + +void ConvNet::bprop(int passIdx, PASS_TYPE passType) { + _totalPassesDone++; + sendMessage(new BpropStartMessage(passType, passIdx), false); + waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL); + reset(passIdx + 1); +} + +void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) { + for (int rcvd = 0; rcvd < numMsgs; rcvd++) { + Message* m = _msgQueue.dequeue(); + assert(m->getType() == msgType); + delete m; + } +} + +// Same as getCost() but adds results to given cost and returns it +Cost& ConvNet::getCost(Cost& cost) { + Cost &tmp = getCost(); + cost += tmp; + delete &tmp; + return cost; +} + +Cost& ConvNet::getCost() { + Cost& cost = *new Cost(); + for (int t = 0; t < _convNetThreads.size(); t++) { + Cost& tcost = _convNetThreads[t]->getCost(); + cost += tcost; + delete &tcost; + } + return cost; +} + +double ConvNet::getCostValue() { + Cost& cost = getCost(); + double val = cost.getValue(); + delete &cost; + return val; +} + +Queue& ConvNet::getMessageQueue() { + return _msgQueue; +} + +intv& ConvNet::getDeviceIDs() { + return _deviceIDs; +} + +ThreadSynchronizer& ConvNet::getSync() { + return *_sync; +} + +void ConvNet::syncWithChildren() { + sendMessage(SYNC, false); + _sync->sync(); +} + +int ConvNet::getTotalPassesDone() { + return _totalPassesDone; +} + +int ConvNet::getMinibatchSize() { + return _dp->getMinibatchSize(); +} + +int ConvNet::getNumReplicasMax() { + return _numReplicasMax; +} + +int ConvNet::getNumReplicasMin() { + return _numReplicasMin; +} + +int ConvNet::getNumPasses() { + return _numReplicasMax / _numReplicasMin; +} + +void ConvNet::setTrainingProgress(double progress) { + _trainingProgress = progress; +} + +double ConvNet::getTrainingProgress() const { + return _trainingProgress; +} + +bool ConvNet::isConserveMemory() { + return _conserveMem; +} + +/* + * Gradient checking stuff + */ +void ConvNet::checkGradients() { + _numFailures = 0; + _numTests = 0; + _baseErr = 0; + for (int p = 0; p < getNumPasses(); ++p) { + fprop(0, p, PASS_GC); + _baseErr += getCostValue(); + bprop(p, PASS_GC); + } + // We call grad check only on the first replica, + // but because weights are aware of their fellow replicas, + // we can simultaneously perturb the weights of all + // replicas. 
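+    // Each Layer::checkGradient() call compares the analytic gradient against a
+    // finite-difference estimate, roughly (C(w + eps) - C(w)) / (numCases * eps),
+    // computed by ConvNet::checkGradient() further below.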
+ for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { + map& layers = it->second; + if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't) + layers[0]->checkGradient(); + } + } + + cout << "------------------------" << endl; + if (_numFailures > 0) { + cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl; + } else { + cout << "ALL " << _numTests << " TESTS PASSED" << endl; + } +} + +// Copies to all replicas +void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) { + int d = NVMatrix::getDeviceID(); + for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { + NVMatrix::setDeviceID(it->second->getDeviceID()); + it->second->getW().copyFromHost(weightsCPU); + } + NVMatrix::setDeviceID(d); +} + +/* + * name: weight matrix name + * eps: finite difference step + */ +bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) { + Matrix numGrad(weights.getNumRows(), weights.getNumCols()); + Matrix diff(numGrad); + numGrad.apply(Matrix::ZERO); + Matrix weightsCPU; + + weights.getW().copyToHost(weightsCPU, true); + + for(int i = 0; i < weights.getNumRows(); i++) { + for (int j = 0; j < weights.getNumCols(); j++) { + float v = weightsCPU(i,j); + weightsCPU(i,j) += eps; + + checkGradient_copyWeightsToGPU(weightsCPU, weights); + + weightsCPU(i,j) = v; + double err = 0; + for (int p = 0; p < getNumPasses(); ++p) { +// printf("trying fprop %d\n", p); + fprop(0, p, PASS_GC); +// printf(" success\n"); + err += getCostValue(); + } + numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps); + if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) { + cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl; + cout << "Consider reducing the sizes of the weights or finite difference steps." << endl; + cout << "Exiting." << endl; + exit(1); + } + checkGradient_copyWeightsToGPU(weightsCPU, weights); + } + } + Matrix gradCPU; + NVMatrix::setDeviceID(weights.getDeviceID()); + map mats; + for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { + mats[it->first] = &it->second->getGrad(); + } + weights.getReducer().reduce(mats, 1, false); + + weights.getGrad().copyToHost(gradCPU, true); + gradCPU.scale(-1.0 / _data->getNumCases()); + float analNorm = gradCPU.norm(); + float numNorm = numGrad.norm(); + numGrad.subtract(gradCPU, diff); + float relErr = diff.norm() / analNorm; + bool fail = relErr >= GC_REL_ERR_THRESH; + if (fail || !GC_SUPPRESS_PASSES) { + cout << "========================" << endl; + printf("(%s) %s GRADIENT CHECK\n", fail ? 
"****FAIL****" : "PASS", name.c_str()); + cout << "========================" << endl; + cout << "Analytic:" << endl; + gradCPU.print(0, 6, 0, 4); + cout << "Numeric:" << endl; + numGrad.print(0, 6, 0, 4); + printf("Analytic norm: %e\n", analNorm); + printf("Numeric norm: %e\n", numNorm); + printf("Relative error: %e\n", relErr); + } + _numTests++; + _numFailures += fail; + return fail; +} + +/* + * ======================================================================================================= + * ConvNetThread + * ======================================================================================================= + */ +ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet) + : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) { + try { + int numLayers = layerList->size(); + + for (int i = 0; i < numLayers; i++) { + PyObject* paramsDict = layerList->at(i); + std::string layerType = pyDictGetString(paramsDict, "type"); + if (layerType != "data") { + intv& gpus = *pyDictGetIntV(paramsDict, "gpu"); + int rid = indexOf(gpus, deviceIdx); + if (rid >= 0) { + initLayer(paramsDict, rid); + } + delete &gpus; + } + } + } catch (std::string& s) { + cout << "Error creating ConvNet: " << s << endl; + exit(1); + } +} + +ConvNetThread::~ConvNetThread() { + NVMatrix::setDeviceID(_deviceID); + NVMatrix::destroyCublas(); + NVMatrix::destroyRandom(); + for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + delete it->second; + } + _nameLayerMap.clear(); +} + +void ConvNetThread::startTimer() { + NVMatrix::syncStream(); + _timer.start(); +} + +double ConvNetThread::stopTimer() { + NVMatrix::syncStream(); + return _timer.stop(); +} + +void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) { + std::string type = pyDictGetString(paramsDict, "type"); + std::string name = pyDictGetString(paramsDict, "name"); + if (type == "fc") { + _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false); + } else if (type == "sfc") { + _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false); + } else if (type == "conv") { + _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID); + } else if (type == "local") { + _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID); + } else if (type == "pool") { + _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID); + } else if (type == "cmpool") { + _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID); + } else if (type == "rnorm") { + _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID); + } else if (type == "cmrnorm") { + _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID); + } else if (type == "cnorm") { + _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID); + } else if (type == "softmax") { + _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID); + } else if (type == "eltsum") { + _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID); + } else if (type == "eltmax") { + _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID); + } else if (type == "neuron") { + _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID); + } else if (type == "nailbed") { + _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID); + } else if (type == "blur") { + _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID); + } else 
if (type == "href") { + _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID); + } else if (type == "resize") { + _nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID); + } else if (type == "rgb2yuv") { + _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID); + } else if (type == "rgb2lab") { + _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID); + } else if (type == "rscale") { + _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID); + } else if (type == "crop") { + _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID); + } else if (type == "concat") { + _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID); + } else if (type == "pass") { + _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID); + } else if (type == "dropout") { + _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID); + } else if (type == "dropout2") { + _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID); + } else if (strncmp(type.c_str(), "cost.", 5) == 0) { + CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID); + _nameLayerMap[name] = c; + _costs.push_back(c); + } else { + throw std::string("Unknown layer type ") + type; + } +} + +/* + * This executes in a new CPU thread so it's OK to initialize CUDA stuff here. + */ +void ConvNetThread::initCuda() { + NVMatrix::setDeviceID(_deviceID); + checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); + for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) { + int d = _convNet->getDeviceIDs()[i]; + if (d != _deviceID) { + if (NVMatrix::canAccessPeer(_deviceID, d)) { + printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d); + checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0)); + } else { + printf("No peer access GPU %d --> GPU %d\n", _deviceID, d); + } + } + } +// NVMatrix::syncStream(); + NVMatrix::initCublas(); + NVMatrix::initRandom(/*7*/); + srand(time(0)); +} + +void* ConvNetThread::run() { + initCuda(); + bool exit = false; + while (!exit) { + Message* m = _msgQueue.dequeue(); + if (m->getType() == FPROP_READY) { + FpropMessage* msg = static_cast(m); + msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx()); + } else if (m->getType() == BPROP_READY) { + BpropMessage* msg = static_cast(m); + msg->getToLayer().incRcvdBInputMsgs(); + msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx()); + } else if (m->getType() == BPROP_START) { + BpropStartMessage* msg = static_cast(m); + for (int i = 0; i < _costs.size(); i++) { + dynamic_cast(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx()); + } + } else if (m->getType() == SYNC) { + NVMatrix::syncStream(); + _convNet->getSync().sync(); + } else if (m->getType() == COPY_TO_CPU) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->copyToCPU(); + } + } else if (m->getType() == COPY_TO_GPU) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->copyToGPU(); + } + } else if (m->getType() == RESET) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->reset(); + } + } else if (m->getType() == RESET_PASS_IDX) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->resetPassIdx(); + } + } else if (m->getType() == UPDATE_WEIGHTS) { + for (NameLayerMap::iterator it = 
_nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->updateWeights(); + } + } else if (m->getType() == CONSTRAIN_WEIGHTS) { + for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { + it->second->constrainWeights(); + } + } else if (m->getType() == EXIT_CONVNET) { + exit = true; + } + delete m; + } + return NULL; +} + +Cost& ConvNetThread::getCost() { + // In a single ConvNetThread, all costs are guaranteed to be different + // (i.e. not replicas of one another) + return *new Cost(_costs); +} + +Layer& ConvNetThread::getLayer(std::string& name) { + return *_nameLayerMap[name]; +} + +int ConvNetThread::getDeviceID() { + return _deviceID; +} + +Queue& ConvNetThread::getMessageQueue() { + return _msgQueue; +} + +vector& ConvNetThread::getCostLayers() { + return _costs; +} + +NameLayerMap& ConvNetThread::getLayerMap() { + return _nameLayerMap; +} + +ConvNet& ConvNetThread::getConvNet() { + return *_convNet; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu new file mode 100644 index 0000000..37afa33 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu @@ -0,0 +1,378 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../include/copypipeline.cuh" +//#include "gpu_util.cuh" + +using namespace std; + +/* ========================= + * ICopySegment + * ========================= + */ +ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) + : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) { + _execDeviceID = _deviceID; +} + +ICopySegment::~ICopySegment() { + if (_stream != NULL) { + checkCudaErrors(cudaStreamDestroy(_stream)); + } +} + +void* ICopySegment::run() { + assert(_execDeviceID != DEVICE_HOST); + NVMatrix::setDeviceID(_execDeviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking)); + bool exit = false; + while (!exit) { + CopyMessage& msg = *_queue.dequeue(); + if (msg.getType() == CopyMessage::EXIT) { + exit = true; + } else { + bool term = processMessage(msg); + if (term) { + assert(_finishQueue != NULL); + _finishQueue->enqueue(1); + } + } + delete &msg; + } + return NULL; +} + +NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) { + NVMatrix& line = mat.reshaped(1, mat.getNumElements()); + int start = chunkIdx * chunkSize; + int end = min((chunkIdx+1) * chunkSize, mat.getNumElements()); + NVMatrix& chunk = line.sliceCols(start, end); + delete &line; + return chunk; +} + +inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) { + if (getDeviceID() == DEVICE_HOST) { + return _hmat; + } + return msg.getMatrix(getDeviceID()); +} + +Queue& ICopySegment::getQueue() { + return _queue; +} + +inline int ICopySegment::getDeviceID() { + return _deviceID; +} + +void ICopySegment::addPrev(ICopySegment& c) { + _prev = &c; + if (_deviceID == DEVICE_HOST) { + _execDeviceID = c.getDeviceID(); + } +} + +void ICopySegment::addNext(CopyPeer& c) { + _next.push_back(&c); + c.addPrev(*this); +} + +bool ICopySegment::isTerminal() const { + return _next.size() == 0; +} + +/* ========================= + * CopySource + * ========================= + */ +CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) { +} + +bool CopySource::processMessage(CopyMessage& msg) { + assert(msg.getType() == CopyMessage::COPY_START); + int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE)))); + int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks); +// printf("num chunks: %d\n", numChunks); + for (int c = 0; c <= numChunks; ++c) { + for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { + (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices())); + } + } + return false; +} + +inline bool CopySource::isSource() const { + return true; +} + +/* ========================= + * CopyPeer + * ========================= + */ +CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) : ICopySegment(parent, deviceID, finishQueue) { +} + +bool CopyPeer::processMessage(CopyMessage& msg) { + assert(msg.getType() == CopyMessage::COPY_CHUNK); + CopyChunkMessage& cmsg = *static_cast(&msg); + if (cmsg.getChunkIdx() < cmsg.getNumChunks()) { + if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) { + getMatrix(msg).resize(_prev->getMatrix(msg)); + } +// getMatrix(msg).printShape("getMatrix(msg)"); +// _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)"); + 
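+            // The upstream segment's matrix and this segment's matrix must agree in
+            // shape before the chunk add/copy below.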
assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg))); + const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0; + const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1; + NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream); + NVMatrix::syncStream(_stream); + delete &prevChunk; + delete &myChunk; + } + for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { + (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg)); + } + return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal(); +} + +inline bool CopyPeer::isSource() const { + return false; +} + +/* ========================= + * IBroadcastNetwork + * ========================= + */ +IBroadcastNetwork& IBroadcastNetwork::make(set devices, int srcDevice) { + if (devices.size() == 8) { + return (new EightGPUBroadcaster1(devices, srcDevice))->construct(); + } else if (devices.size() == 1) { + return (new NullBroadcaster(devices, srcDevice))->construct(); + } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { + return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); + } + return (new NaiveBroadcaster(devices, srcDevice))->construct(); +} + +IBroadcastNetwork::IBroadcastNetwork(set& devices, int srcDeviceID, int numTerminal) + : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) { +} + +IBroadcastNetwork::~IBroadcastNetwork() { + vector v; + v.insert(v.end(), _peers.begin(), _peers.end()); + v.insert(v.end(), _src); + for (vector::const_iterator it = v.begin(); it != v.end(); ++it) { + (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT)); + (*it)->join(); + delete *it; + } +} + +IBroadcastNetwork& IBroadcastNetwork::construct() { + assert(!_constructed); + pair,vector > gpus = makeGPULists(); + _src = new CopySource(*this, _srcDeviceID); + makePeers(gpus); + makeConnections(); + _src->start(); + for (vector::const_iterator it = _peers.begin(); it != _peers.end(); ++it) { + (*it)->start(); + } + _constructed = true; + return *this; +} + +pair,vector > IBroadcastNetwork::makeGPULists() { + vector same, other; + for (set::const_iterator it = _devices.begin(); it != _devices.end(); ++it) { + if (*it != _srcDeviceID) { + if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) { + same.insert(same.begin() + rand() % (1 + same.size()), *it); + } else { + other.insert(other.begin() + rand() % (1 + other.size()), *it); + } + } + } + return pair,vector >(same, other); +} + +void IBroadcastNetwork::broadcast(std::map& mats) { + _broadcast(mats, 1, 0); +} + +void IBroadcastNetwork::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { + assert(_constructed); + assert(_finishQueue.getNumElements() == 0); + assert(mats.size() == _devices.size()); + assert(mats.size() > 1); + if (mats[_srcDeviceID]->getNumElements() == 0) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + it->second->resize(*mats[_srcDeviceID]); + } + } else { + _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats)); + for (int i = 0; i < _numTerminal; ++i) { + _finishQueue.dequeue(); + } + } + assert(_finishQueue.getNumElements() == 0); +} + +int IBroadcastNetwork::getSourceDeviceID() const { + return _srcDeviceID; +} + +void IBroadcastNetwork::makePeers(pair,vector 
>& gpus) { + vector& same = gpus.first, &other = gpus.second; + for (int i = 0; i < same.size(); ++i) { + _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue)); + } + for (int i = 0; i < other.size(); ++i) { + _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue)); + } + _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7] +} + +/* ========================= + * ISafeBroadcastNetwork + * ========================= + */ +ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set devices, int srcDevice) { + if (devices.size() == 1) { + return (new NullBroadcaster(devices, srcDevice))->construct(); + } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { + return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); + } + return (new NaiveBroadcaster(devices, srcDevice))->construct(); +} + +ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) { +} + +void ISafeBroadcastNetwork::broadcast(std::map& mats, float scaleSource, float scaleTargets) { + _broadcast(mats, scaleSource, scaleTargets); +} + +ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() { + IBroadcastNetwork::construct(); + return *this; +} + +/* ========================= + * NullBroadcaster + * ========================= + */ +NullBroadcaster::NullBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { +} + +void NullBroadcaster::makeConnections() { +} + +NullBroadcaster& NullBroadcaster::construct() { + _constructed = true; + return *this; +} + +void NullBroadcaster::broadcast(std::map& mats, float scaleSource, float scaleTargets) { +} + +void NullBroadcaster::broadcast(std::map& mats) { +} + +/* ========================= + * NaiveBroadcaster + * ========================= + * + * This one does src -> host -> all + */ +NaiveBroadcaster::NaiveBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size()-1) { +} + +void NaiveBroadcaster::makeConnections() { + _src->addNext(*_peers.back()); // Make connection src -> host + for (int i = 0; i < _peers.size() - 1; ++i) { + if (_peers[i]->getDeviceID() != _src->getDeviceID()) { + _peers.back()->addNext(*_peers[i]); // Make connection host -> peer + } + } +} + +/* ========================= + * EightGPUBroadcaster1 + * ========================= + * + * This one does a fancy graph + */ +EightGPUBroadcaster1::EightGPUBroadcaster1(set& devices, int srcDeviceID) : IBroadcastNetwork(devices, srcDeviceID, 4) { +} + +void EightGPUBroadcaster1::makeConnections() { + _src->addNext(*_peers[7]); + _peers[7]->addNext(*_peers[0]); + _peers[7]->addNext(*_peers[1]); + _peers[7]->addNext(*_peers[3]); + _peers[7]->addNext(*_peers[4]); + + _peers[1]->addNext(*_peers[2]); + _peers[3]->addNext(*_peers[5]); + _peers[4]->addNext(*_peers[6]); +} + +/* ========================= + * TwoPeeringGPUsBroadcaster + * ========================= + */ +TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { + _tgtDeviceID = *devices.begin() == srcDeviceID ? 
*(++devices.begin()) : *devices.begin(); +} + +TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() { + if (_constructed) { + checkCudaErrors(cudaStreamDestroy(_tgtStream)); + } +} + +void TwoPeeringGPUsBroadcaster::makeConnections() { +} + +void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) { + if (d >= 0) { + NVMatrix::setDeviceID(d); + } +} + +ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() { + assert(!_constructed); + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_tgtDeviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking)); + resetDeviceID(d); + _constructed = true; + return *this; +} + +void TwoPeeringGPUsBroadcaster::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_tgtDeviceID); + mats[_tgtDeviceID]->add(*mats[_srcDeviceID], scaleTargets, scaleSource, *mats[_tgtDeviceID], _tgtStream); + NVMatrix::syncStream(_tgtStream); + resetDeviceID(d); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu new file mode 100644 index 0000000..55d466a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu @@ -0,0 +1,113 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/cost.cuh" + +using namespace std; + +/* + * ===================== + * Cost + * ===================== + */ + +Cost::Cost() { +} + +Cost::Cost(vector& costs) { + for (vector::iterator it = costs.begin(); it != costs.end(); ++it) { + _costMap[(*it)->getName()] = &(*it)->getCost(); + _costCoeffMap[(*it)->getName()] = (*it)->getCoeff(); + _numCases[(*it)->getName()] = (*it)->getNumCases(); + } +} + +int Cost::getNumCases() { + return _numCases.size() == 0 ? 0 : _numCases.begin()->second; +} + +map& Cost::getNumCasesMap() { + return _numCases; +} + +doublev& Cost::operator [](const std::string s) { + return *_costMap[s]; +} + +CostMap& Cost::getCostMap() { + return _costMap; +} + +CostCoeffMap& Cost::getCostCoeffMap() { + return _costCoeffMap; +} + +double Cost::getValue() { + double val = 0; + for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 
0 : it->second->at(0)); + } + return val; +} + +Cost& Cost::operator += (Cost& er) { + CostMap& otherMap = er.getCostMap(); + CostCoeffMap& otherCoeffMap = er.getCostCoeffMap(); + + for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) { + bool newCost = _costMap.count(it->first) == 0; + if (newCost) { + _costMap[it->first] = new doublev(); + _costCoeffMap[it->first] = otherCoeffMap[it->first]; + _numCases[it->first] = er.getNumCasesMap()[it->first]; + } else { + _numCases[it->first] += er.getNumCasesMap()[it->first]; + } + + doublev& myVec = *_costMap[it->first]; + doublev& otherVec = *otherMap[it->first]; + assert(myVec.size() == 0 || otherVec.size() == 0 || myVec.size() == otherVec.size()); + // Add costs from otherVec to me + for (int i = 0; i < otherVec.size(); i++) { + if (myVec.size() <= i) { + myVec.push_back(0); + } + myVec[i] += otherVec[i]; + } + } + return *this; +} + +Cost::~Cost() { + for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + delete it->second; + } +} + +void Cost::print() { + for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { + printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]); + doublev& vec = *_costMap[it->first]; + for (int z = 0; z < vec.size(); ++z) { + printf("%.3f", vec[z]); + if (z < vec.size() - 1) { + printf(", "); + } + } + printf("\n"); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu new file mode 100644 index 0000000..6c2cdcd --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu @@ -0,0 +1,82 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../util/include/matrix.h" +#include "../include/data.cuh" +#include "../include/timer.cuh" + +using namespace std; + +DataProvider::DataProvider(int minibatchSize) : + _minibatchSize(minibatchSize), _hData(NULL) { +} + +void DataProvider::clearData() { + delete _hData; + _hData = NULL; +} + +void DataProvider::setData(CPUData& hData) { + // DataWorker calls clearData + _hData = &hData; + assert(_hData != NULL); +} + +CPUData& DataProvider::getMinibatch(int idx) { + assert(idx >= 0 && idx < getNumMinibatches()); + return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize); +} + +CPUData& DataProvider::getDataSlice(int startCase, int endCase) { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + endCase = min(_hData->getNumCases(), endCase); + // TODO: maintain these matrices, no point re-creating them all the time + MatrixV& miniData = *new MatrixV(); + + for (int i = 0; i < _hData->getData().size(); i++) { + // NOTE: if hData is transposed, then the output minibatch matrix + // can be a view. No need to allocate new CPU memory here. Might + // want to look into optimizing that in the future, though it's + // unlikely to be a big deal. 
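+        // Two cases: for transposed data the column slice is taken directly; for
+        // non-transposed data it is copied into a newly allocated Matrix.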
+ if (_hData->isTrans()) { + miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase)); + } else { + miniData.push_back(new Matrix()); + (*_hData)[i].sliceCols(startCase, endCase, *miniData.back()); + } + } + CPUData& cpuData = *new CPUData(&miniData); + return *new CPUData(&miniData); +} + +int DataProvider::getNumMinibatches() { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + return DIVUP(_hData->getNumCases(), _minibatchSize); +} + +int DataProvider::getMinibatchSize() { + return _minibatchSize; +} + +int DataProvider::getNumCases() { + assert(_hData != 0); + assert(_hData->getNumCases() > 0); + return _hData->getNumCases(); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu new file mode 100644 index 0000000..0a70182 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu @@ -0,0 +1,202 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/util.cuh" +#include "../include/gradreducer.cuh" + +using namespace std; + +/* ===================== + * IGradReducer + * ===================== + */ +IActGradReducer::IActGradReducer(Layer& parent, map numExpectedMsgs) + : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) { + _numExpectedMsgsTotal = 0; + for (map::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) { + _numExpectedMsgsTotal += it->second; + } +// printf("%s[%d] expected %d backward msgs\n", parent.getName().c_str(), parent.getReplicaID(), _numExpectedMsgsTotal); +} + +IActGradReducer::~IActGradReducer() { + +} + +void* IActGradReducer::run() { + while (true) { + reset(); + if (reduce()) { + break; + } + _finishQueue.enqueue(0); + } + return NULL; +} + +// Cost layer will have nothing to dequeue, so just return immediately. 
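+// Blocks until one complete reduction pass has finished; returns immediately when
+// this reducer expects no gradient messages at all. Typical usage (sketch): the
+// backward pass calls enqueueReduction() once per incoming gradient, and the owner
+// calls waitForFinish() before reading the reduced gradients.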
+int IActGradReducer::waitForFinish() {
+    if (_numExpectedMsgsTotal > 0) {
+        int i = _finishQueue.dequeue();
+        assert(_finishQueue.getNumElements() == 0);
+        return i;
+    }
+//    printf("%s not waiting for finish\n", _name.c_str());
+    return 0;
+}
+
+IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int,int> numExpectedMsgs) {
+    int tgtDeviceID = parent.getDeviceID();
+    if (numExpectedMsgs.count(tgtDeviceID) == 0) {
+        numExpectedMsgs[tgtDeviceID] = 0;
+    }
+    if (numExpectedMsgs.size() == 8) {
+        return *new ParallelActGradReducer(parent, numExpectedMsgs);
+    }
+    return *new SequentialActGradReducer(parent, numExpectedMsgs);
+}
+
+/* =====================
+ * SequentialGradReducer
+ * =====================
+ */
+SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int,int> numExpectedMsgs)
+    : IActGradReducer(parent, numExpectedMsgs) {
+    intv deviceIDs;
+    int tgtDeviceID = parent.getDeviceID();
+    for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
+        if (it->first != tgtDeviceID) {
+            deviceIDs.push_back(it->first);
+        }
+    }
+    if (numExpectedMsgs[tgtDeviceID] > 0) {
+        deviceIDs.push_back(tgtDeviceID);
+    }
+
+    sort(deviceIDs.begin(), deviceIDs.end());
+
+    int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
+    for (int i = 0; i < deviceIDs.size(); ++i) {
+        if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
+            firstDeviceIdx = i;
+            firstDeviceID = deviceIDs[i];
+        }
+    }
+
+    // This is the order in which we process devices.
+    for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
+        int d = deviceIDs[i];
+        _deviceIDs.push_back(d);
+        _messageQueues[d] = new Queue<int>();
+    }
+    //shuffleVector(_deviceIDs, 1, _deviceIDs.size());
+    _broadcaster = new StreamBroadcast();
+
+    // Note that we MUST process the tgtDeviceID first because
+    // we write to it at every iteration, and the computation
+    // thread writes to it too. By processing it first we ensure
+    // that there's no race condition.
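+    // The assert below checks that invariant: if the target device expects any
+    // messages at all, it must be the first entry in _deviceIDs.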
+ assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID); + reset(); +} + +SequentialActGradReducer::~SequentialActGradReducer() { + for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { + delete it->second; + } + delete _broadcaster; +} + +void SequentialActGradReducer::reset() { + for (map::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) { + _numReceivedMsgs[it->first] = 0; + } +} + +bool SequentialActGradReducer::reduce() { + int tgtDeviceID = _parent->getDeviceID(); + for (int didx = 0; didx < _deviceIDs.size(); ) { + int d = _deviceIDs[didx]; + _numReceivedMsgs[d] += _messageQueues[d]->dequeue(); + if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) { + if (d != tgtDeviceID) { + NVMatrix::setDeviceID(tgtDeviceID); + + _parent->getActsGrad().resize(_parent->getActsGrad(d)); + map mats; + mats[d] = &_parent->getActsGrad(d); + mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID); + + _broadcaster->transfer(mats, d, didx > 0, 1); + } + didx++; + assert(_messageQueues[d]->getNumElements() == 0); + } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit + return true; + } + } + return false; +} + +void SequentialActGradReducer::enqueueReduction(int deviceID) { + _messageQueues[deviceID]->enqueue(1); +} + +void SequentialActGradReducer::stop() { + for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { + it->second->enqueue(ACT_GRAD_REDUCER_EXIT); + } + join(); +} + +/* ===================== + * ParallelActGradReducer + * ===================== + */ +ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map numExpectedMsgs) + : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) { + _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct(); + + _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0; +} + +bool ParallelActGradReducer::reduce() { + // TODO: make it so that you can start the reduction before you've received all the messages. + while(_numReceivedMsgs < _numExpectedMsgsTotal) { + _numReceivedMsgs += _messageQueue.dequeue(); + } + if (_numReceivedMsgs > _numExpectedMsgsTotal) { + return true; // exit + } + map mats = _parent->getAllActsGrads(); + _reducer->reduce(mats, 1, _scaleTarget); + assert(_messageQueue.getNumElements() == 0); + return false; + +} + +void ParallelActGradReducer::enqueueReduction(int deviceID) { + _messageQueue.enqueue(1); +} + +void ParallelActGradReducer::stop() { + _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT); + join(); +} + +void ParallelActGradReducer::reset() { + _numReceivedMsgs = 0; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp new file mode 100644 index 0000000..7d158df --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp @@ -0,0 +1,135 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/jpeg.h" + +using namespace std; + +/* ======================== + * DecoderThread + * ======================== + */ +DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview) +: Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img), + _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview), + _decodeTarget(0), _decodeTargetSize(0) { + + _inner_pixels = _inner_size * _inner_size; + _rseed = time(0); +} + +DecoderThread::~DecoderThread(){ + free(_decodeTarget); +} + +void* DecoderThread::run() { + int numSrcCases = PyList_GET_SIZE(_pyList); + assert(_target->getNumCols() == _inner_pixels * 3); + assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1)); + + int width, height; + + for (int64 i = _start_img; i < _end_img; ++i) { + decodeJpeg(i, width, height); + assert((width == _img_size && height >= _img_size) + || (height == _img_size && width >= _img_size)); + if (_multiview) { + for (int flip = 0; flip < 2; ++flip) { + crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left + crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right + crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center + crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left + crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right + } + } else { + crop(i, width, height, !_test && (rand_r(&_rseed) % 2)); + } + + } + return NULL; +} + +void DecoderThread::decodeJpeg(int idx, int& width, int& height) { + PyObject* pySrc = PyList_GET_ITEM(_pyList, idx); + unsigned char* src = (unsigned char*)PyString_AsString(pySrc); + size_t src_len = PyString_GET_SIZE(pySrc); + + struct jpeg_decompress_struct cinf; + struct jpeg_error_mgr jerr; + cinf.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinf); + jpeg_mem_src(&cinf, src, src_len); + assert(jpeg_read_header(&cinf, TRUE)); + cinf.out_color_space = JCS_RGB; + assert(jpeg_start_decompress(&cinf)); + assert(cinf.num_components == 3 || cinf.num_components == 1); + width = cinf.image_width; + height = cinf.image_height; + + if (_decodeTargetSize < width * height * 3) { + free(_decodeTarget); + _decodeTargetSize = width * height * 3 * 3; + _decodeTarget = (unsigned char*)malloc(_decodeTargetSize); + } + + while (cinf.output_scanline < cinf.output_height) { + JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline]; + assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0); + } + assert(jpeg_finish_decompress(&cinf)); + jpeg_destroy_decompress(&cinf); +} + +/* + * Uniform in [0,1) + */ +inline double DecoderThread::randUniform() { + return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1); +} + +/* + * Uniform in [min, max) + */ +inline double DecoderThread::randUniform(double min, double max) { + return (max - min) * randUniform() + min; +} + +void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) { + crop(i, src_width, src_height, flip, -1, -1); +} + +void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) { + const int64 
border_size_y = src_height - _inner_size; + const int64 border_size_x = src_width - _inner_size; + if (crop_start_x < 0) { + crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1)); + } + if (crop_start_y < 0) { + crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1)); + } + const int64 src_pixels = src_width * src_height; + for (int64 c = 0; c < 3; ++c) { + for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) { + for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) { + assert((y >= 0 && y < src_height && x >= 0 && x < src_width)); + _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size + + (flip ? (_inner_size - 1 - x + crop_start_x) + : (x - crop_start_x))) + = _decodeTarget[3 * (y * src_width + x) + c]; + } + } + } +} \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu new file mode 100644 index 0000000..4ff54f8 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu @@ -0,0 +1,2306 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../../cudaconv3/include/cudaconv2.cuh" +#include "../../util/include/matrix.h" +#include "../include/layer_kernels.cuh" +#include "../include/layer.cuh" +#include "../include/data.cuh" +#include "../include/util.cuh" +#include "../include/weights.cuh" + +using namespace std; + +/* + * ======================= + * Layer + * ======================= + */ +Layer::Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) : + _convNetThread(convNetThread), _replicaID(replicaID), _trans(trans) { + _name = pyDictGetString(paramsDict, "name"); + _type = pyDictGetString(paramsDict, "type"); + + _foundGradConsumers = false; + _gradConsumer = pyDictGetInt(paramsDict, "gradConsumer"); + _actsTarget = pyDictGetInt(paramsDict, "actsTarget"); + _actsGradTarget = pyDictGetInt(paramsDict, "actsGradTarget"); + _numOutputs = pyDictGetInt(paramsDict, "outputs"); + _numReplicas = pyDictGetInt(paramsDict, "numReplicas"); + _numReplicasPrev = 1; + _rcvdBInputMsgs = 0; + + _actBroadcaster = NULL; + _gradReducer = NULL; + _initialized = false; +} + +Layer::~Layer() { + if (_actBroadcaster != NULL) { + _actBroadcaster->stop(); + delete _actBroadcaster; + } + if (_gradReducer != NULL) { + _gradReducer->stop(); + delete _gradReducer; + } + // For now, gradReducer doesn't have a destructor +// delete _gradReducer; + for (std::map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } + for (std::map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } +} + +cudaStream_t Layer::getStream() { + 
assert(getDeviceID() >= 0); + return NVMatrix::getDefaultStream(getDeviceID()); +} + +void Layer::syncStream() { + NVMatrix::syncStream(getStream()); +} + +void Layer::fpropNext(PASS_TYPE passType, int passIdx) { + if (_next.size() > 0) { + if (getFwdActiveReplicaIdx(passIdx) == 0/*getReplicaIdx()*/) { // 0 turns on pipelining + if (_nextDeviceIDs.size() > 1 || (_nextDeviceIDs.size() == 1 && _nextDeviceIDs[0] != getDeviceID())) { + syncStream(); // Make sure I've finished computing before broadcasting + } + getActBroadcaster().getMessageQueue().enqueue(new BroadcastMessage(getAllActs(), getDeviceID(), getReplicaIdx(), _broadcastFinishQueue)); + } + if (getFwdActiveReplicaIdx(passIdx) == getReplicaIdx()) { + _broadcastFinishQueue.dequeue(); + assert(_broadcastFinishQueue.getNumElements() == 0); + } + } + + for (int i = 0; i < _next.size(); i++) { + _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx)); + } +} + +bool Layer::fprop(PASS_TYPE passType, int passIdx) { + _rcvdFInputMsgs++; + // I require messages from *all* input replicas because it makes the propagation easier to think about. + // Without this requirement, when all fprop terminal msgs arrive to ConvNet, the forward propagation + // might not actually be finished yet. + if (_rcvdFInputMsgs == getNumExpectedFwdMsgs()) { +// printf("Layer %s[%d] fprop\n", _name.c_str(), getReplicaID()); + int ridx = getFwdActiveInputReplicaIdx(passIdx); + assert(getDeviceID() == NVMatrix::getDeviceID()); + map v; + if (ridx >= 0) { + for (int i = 0; i < getNumLayersPrev(); i++) { + v[i] = &_prev[ridx][i]->getActs(getDeviceID()); + } + } + fprop(v, passType, passIdx); + return true; + } + return false; +} + +void Layer::fprop(map& v, PASS_TYPE passType, int passIdx) { + if (getFwdActiveInputReplicaIdx(passIdx) >= 0) { + assert(v.size() == getNumLayersPrev()); + _inputs.clear(); + _inputs.insert(v.begin(), v.end()); + + int numCases = _inputs[0]->getLeadingDim(); + for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + it->second->getMemory(numCases); + } + + if (numCases > 0) { + //printf("layer %s fprop, numcases: %d\n", _name.c_str(), numCases); + _rcvdFInputMsgs = getNumExpectedFwdMsgs(); + for (map::iterator it = v.begin(); it != v.end(); ++it) { + it->second->transpose(_trans); + } + getActs().transpose(_trans); + + fpropCommon(passType); + + // First do fprop on the input whose acts matrix I'm sharing, if any + if (_actsTarget >= 0) { + fpropActs(_actsTarget, 0, passType, passIdx); + } + // Then add the rest of the inputs to that + for (int i = 0; i < getNumLayersPrev(); i++) { + if (i != _actsTarget) { + fpropActs(i, _actsTarget >= 0 || i > 0, passType, passIdx); + } + } + } + } + fpropNext(passType, passIdx); +} + +void Layer::truncBwdActs() { + // Only truncate actsGrad if I own it + if (_actsGradTarget < 0) { + for (map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + it->second->getMemorySource().truncate(getName()); + } + } + if (_actsTarget < 0) { + for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + it->second->getMemorySource().truncate(getName()); + } + } +} + +int Layer::getNumGradProducersNext() { + return _numGradProducersNext; +} + +int Layer::getNumExpectedBwdMsgs() { + return _numGradProducersNext * getNumSiblingReplicas(); +} + +int Layer::getNumExpectedFwdMsgs() { + return getNumLayersPrev() * getNumInputReplicas(); +} + +void Layer::bprop(PASS_TYPE passType, int passIdx) { + if 
(getBwdActiveInputReplicaIdx(passIdx) >= 0 && _rcvdBInputMsgs == getNumExpectedBwdMsgs()) { +// printf("Layer %s[%d] bprop\n", _name.c_str(), getReplicaID()); + if (_gradReducer != NULL) { + _gradReducer->waitForFinish(); + } + + // This does sync, but only if it has grad consumers below! so we must sync again before sending bprop terminal messages + bprop(getActsGrad(), passType, passIdx); + + if (_bwdTerminal[passIdx]) { + syncStream(); + getConvNet().getMessageQueue().enqueue(new Message(BPROP_TERMINAL)); + } + } +} + +void Layer::bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx) { + Layer& prev = *_prev[replicaIdx][inputIdx]; + if (prev.isGradConsumer() && isGradProducer(prev.getName())) { + if (v.getLeadingDim() > 0) { // Only do computation if #cases > 0 + bpropActs(v, replicaIdx, inputIdx, prev.getNumComputedActsGrads(getDeviceID()) > 0, passType); + } + prev.getNumComputedActsGrads(getDeviceID())++; + // Synchronize if the previous layer is going to actually do a reduction. + // If the previous layer is on the same GPU as us and has no next layers + // on other GPUs then it won't need to do a reduction. + if (prev.getNextDeviceIDs().size() > 1 || (prev.getNextDeviceIDs().size() == 1 && getDeviceID() != prev.getDeviceID())) { + syncStream(); + } + prev.getGradReducer().enqueueReduction(getDeviceID()); + } +} + +void Layer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) { + + v.transpose(_trans); + assert(getDeviceID() == NVMatrix::getDeviceID()); + int ridx = getBwdActiveInputReplicaIdx(passIdx); + LayerV& prev = _prev[ridx]; + map > prevByDevice = _prevByDevice[ridx]; + + for (int i = 0; i < prev.size(); i++) { + _inputs[i]->transpose(_trans); + prev[i]->getActsGrad().transpose(_trans); + } + getActs().transpose(_trans); + // NOTE: this should be here (before the bpropActs) because if you have a layer + // that has a weight matrix AND actsGradTarget >= 0, then the stuff below will overwrite + // v which is used in bpropCommon. So bpropCommon must come first. + bpropCommon(v, ridx, passType); + + if (isGradProducer()) { + // First propagate activity gradient to all layers whose activity + // gradient matrix I'm definitely not sharing. + for (map >::const_iterator it = prevByDevice.begin(); it != prevByDevice.end(); ++it) { + const set& deviceLayers = it->second; + for (set::const_iterator it2 = deviceLayers.begin(); it2 != deviceLayers.end(); ++it2) { + if (_actsGradTarget != (*it2)->getInputIdx(_name)) { + bpropActsCall(v, passType, ridx, (*it2)->getInputIdx(_name)); + } + } + } + + // Then propagate activity gradient to the layer whose activity gradient + // matrix I'm sharing, if any. + if (_actsGradTarget >= 0) { + bpropActsCall(v, passType, ridx, _actsGradTarget); + } + } + + // Synchronization is necessary because the kernel calls that compute my backward acts + // execute asynchronously. Therefore I don't want to tell other threads that I've + // computed bprop activities for them when in fact I've only called a function which + // will eventually compute them. + if (_prevDeviceIDs.size() > 1 || (_prevDeviceIDs.size() == 1 && _prevDeviceIDs[0] != getDeviceID())) { + syncStream(); + } + + if (getConvNet().isConserveMemory()) { + truncBwdActs(); + } + + if (isGradProducer()) { + /*for (int i = 0; i < prev.size(); i++) { + if (prev[i]->isGradConsumer() && isGradProducer(prev[i]->getName())) { + prev[i]->getGradReducer().enqueueReduction(getDeviceID()); + } + }*/ + + // Send backward messages to *all* replicas. 
+ // Note that the messages will be dismissed unless the passIdx indicates + // that the previous layer should do some work. + for (int r = 0; r < getNumInputReplicas(); r++) { + for (int i = 0; i < _prev[r].size(); i++) { + if (_prev[r][i]->isGradConsumer() && isGradProducer(_prev[r][i]->getName())) { + _prev[r][i]->getConvNetThread().getMessageQueue().enqueue(new BpropMessage(*_prev[r][i], passType, passIdx)); + } + } + } + } +} + +IActGradReducer& Layer::getGradReducer() { + return *_gradReducer; +} + +// This is called between minibatches +void Layer::reset() { + _rcvdFInputMsgs = 0; + _rcvdBInputMsgs = 0; + for (map::iterator it = _numComputedActsGrads.begin(); it != _numComputedActsGrads.end(); ++it) { + it->second = 0; + } +} + +// This is called between microbatches +void Layer::resetPassIdx() { + _rcvdFInputMsgs = 0; + if (_rcvdBInputMsgs >= getNumExpectedBwdMsgs()) { + reset(); + } +} + +/* + * Returns number of cases in given matrix. + */ +int Layer::getNumCases(NVMatrix& v) { + return v.getLeadingDim(); +} + +int Layer::incRcvdBInputMsgs() { + return ++_rcvdBInputMsgs; +} + +std::string& Layer::getName() { + return _name; +} + +std::string& Layer::getType() { + return _type; +} + +int& Layer::getNumComputedActsGrads(int deviceID) { + return _numComputedActsGrads[deviceID]; +} + +void Layer::addNext(Layer& l) { + _next.push_back(&l); + _numReplicasNext = l.getNumReplicas(); + if (count(_nextDeviceIDs.begin(), _nextDeviceIDs.end(), l.getDeviceID()) == 0) { + int pos = rand() % (_nextDeviceIDs.size() + 1); + _nextDeviceIDs.insert(_nextDeviceIDs.begin() + pos, l.getDeviceID()); + } +} + +void Layer::addPrev(Layer& l, int replicaIdx) { + _prev[replicaIdx].push_back(&l); + _numReplicasPrev = l.getNumReplicas(); + l.setInputIdx(getName(), _prev[replicaIdx].size() - 1); + if (l.getDeviceID() >= 0 && count(_prevDeviceIDs.begin(), _prevDeviceIDs.end(), l.getDeviceID()) == 0) { + int pos = rand() % (_prevDeviceIDs.size() + 1); + _prevDeviceIDs.insert(_prevDeviceIDs.begin() + pos, l.getDeviceID()); + } +} + +void Layer::addReplica(Layer& l) { + assert(_replicas.count(l.getReplicaID()) == 0); + _replicas[l.getReplicaID()] = &l; +} + +bool Layer::hasGradProducerNext(std::string& layerName) { + bool b = _next.size() == 0; + for (int i = 0; i < _next.size(); i++) { + b |= _next[i]->hasGradProducerNext(_name); + } + return b && isGradProducer(layerName); +} + +bool Layer::postInit() { + // We choose not to populate _outputs[getDeviceID()] here because we do it instead in fprop(). + // In fprop(), we can populate it from the _inputs vector, which is a bit more general than populating + // it from _prev->getActs() +// _outputs = _actsTarget < 0 ? new NVMatrix() : &_prev[_actsTarget]->getActs(); + if (!_initialized) { + _initialized = true; + map numGradProducersNext; + _numGradProducersNext = 0; + for (int r = 0; r < getNumInputReplicas(); ++r) { + for (vector::const_iterator it = _prev[r].begin(); it != _prev[r].end(); ++it) { + (*it)->postInit(); + } + } + + _memSrcActs[getDeviceID()] = _actsTarget < 0 ? &MemorySource::make(_numOutputs, getDeviceID(), getName()) + : &_prev[0][_actsTarget]->getMemorySourceActs(getDeviceID()).clone(_name); + + // _actsGradTarget will only be >= 0 when the number of replicas is the same in both layers, so this justifies the use of _prev[0] + + _memSrcActsGrad[getDeviceID()] = _actsGradTarget < 0 ? 
&MemorySource::make(_numOutputs, getDeviceID(), getName()) + : &_prev[0][_actsGradTarget]->getMemorySourceActsGrad(getDeviceID()).clone(_name); + for (int i = 0; i < _next.size(); ++i) { + int d = _next[i]->getDeviceID(); + _numComputedActsGrads[d] = 0; + if (_next[i]->hasGradProducerNext(_name)) { + if (numGradProducersNext.count(d) == 0) { + numGradProducersNext[d] = 0; + } + numGradProducersNext[d]++; + _numGradProducersNext++; + if (_memSrcActsGrad.count(d) == 0) { + _memSrcActsGrad[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + if (_memSrcActs.count(d) == 0) { + _memSrcActs[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + + if (_next.size() == 0) { + _numReplicasNext = getNumReplicas(); + } + + /* + * Initialize forward broadcaster. First sibling owns it. + */ + if (getReplicaIdx() == 0 && _convNetThread != NULL) { + _actBroadcaster = new ActBroadcaster(getNumSiblingReplicas(), getDeviceCPUs(_convNetThread->getDeviceID())); + _actBroadcaster->start(); + } + + /* + * Initialize backward reducer. + */ + if (isGradConsumer() && _numGradProducersNext > 0) { + _gradReducer = &IActGradReducer::makeGradReducer(*this, numGradProducersNext); + _gradReducer->start(); + } + + /* + * Initialize specially sorted previous array + */ + for (int r = 0; r < _prev.size(); ++r) { + for (int i = 0; i < _prev[r].size(); ++i) { + // Previous devices in reverse order of processing by (sequential) GradReducer + _prevByDevice[r][getDeviceID() - _prev[r][i]->getDeviceID() + + 16 * (_prev[r][i]->getDeviceID() > getDeviceID())].insert(_prev[r][i]); + + } + } + return true; + } + return false; +} + +ActBroadcaster& Layer::getActBroadcaster() { + return getReplicaIdx() == 0 ? *_actBroadcaster : _replicas[getReplicaID() - getReplicaIdx()]->getActBroadcaster(); +} + +// Does this layer, or some layer below it, need the gradient +// for parameter updates? +// Only weight layers should be grad consumers themselves. +bool Layer::isGradConsumer() { + if (!_foundGradConsumers && _prev.size() > 0) { + for (int i = 0; i < _prev[0].size(); i++) { + _gradConsumer |= _prev[0][i]->isGradConsumer(); + } + _foundGradConsumers = true; + } + return _gradConsumer; +} + +// Does this layer produce gradient for layers below? 
+bool Layer::isGradProducer() { + return true; +} + +bool Layer::isGradProducer(std::string& layerName) { + return isGradProducer(); +} + +map >& Layer::getPrev() { + return _prev; +} + +vector& Layer::getNext() { + return _next; +} + +NVMatrix& Layer::getActs() { + return getActs(getDeviceID()); +} + +NVMatrix& Layer::getActs(int deviceID) { + assert(_memSrcActs.count(deviceID) > 0); + return _memSrcActs[deviceID]->getMemory(); +} + +NVMatrix& Layer::getActs(int deviceID, int numCases) { + assert(_memSrcActs.count(deviceID) > 0); + return _memSrcActs[deviceID]->getMemory(numCases); +} + +NVMatrix& Layer::getActsGrad(int deviceID) { + assert(_memSrcActsGrad.count(deviceID) > 0); + return _memSrcActsGrad[deviceID]->getMemory(getActs(deviceID).getLeadingDim()); +} + +NVMatrix& Layer::getActsGrad() { + return getActsGrad(NVMatrix::getDeviceID()); +} + +map Layer::getAllActs() { + map m; + for (map::const_iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { + m[it->first] = &it->second->getMemory(); + } + return m; +} + +map Layer::getAllActsGrads() { + map m; + for (map::const_iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { + m[it->first] = &it->second->getMemory(); + } + return m; +} + +int Layer::getDeviceID() { + return _convNetThread == NULL ? -1 : _convNetThread->getDeviceID(); +} + +ConvNetThread& Layer::getConvNetThread() { + assert(_convNetThread != NULL); + return *_convNetThread; +} + +ConvNet& Layer::getConvNet() { + return getConvNetThread().getConvNet(); +} + +void Layer::setBwdTerminal(int passIdx) { + _bwdTerminal[passIdx] = true; +} + +int Layer::getReplicaID() { + return _replicaID; +} + +int Layer::getActivePassPeriod() { + return getNumReplicas() / getConvNet().getNumReplicasMin(); +} + +int Layer::getFwdActiveInputReplicaIdx(int passIdx) { + const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas(); + return passIdx % getActivePassPeriod() == 0 ? edge : -1; +} + +int Layer::getBwdActiveInputReplicaIdx(int passIdx) { + const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas(); + return (passIdx + 1) % getActivePassPeriod() == 0 ? edge : -1; +} + +int Layer::getFwdActiveReplicaIdx(int passIdx) { + assert(_next.size() > 0); + return _next[0]->getFwdActiveInputReplicaIdx(passIdx); +} + +int Layer::getNumReplicas() { + return _replicas.size(); +} + +int Layer::getNumSiblingReplicas() { + return getNumReplicas() / getNumReplicasNext(); +} + +int Layer::getNumReplicasPrev() { + return _numReplicasPrev; +} + +int Layer::getNumReplicasNext() { + return _numReplicasNext; +} + +int Layer::getNumInputReplicas() { + return _numReplicasPrev / getNumReplicas(); +} + +int Layer::getReplicaIdx() { + return getReplicaID() % getNumSiblingReplicas(); +} + +int Layer::getNumLayersPrev() { + return _prev.size() > 0 ? 
_prev[0].size() : 0; +} + +void Layer::setMemorySourceActs(int deviceID, MemoryView& mem) { + assert(_memSrcActs[deviceID]->isParent()); + delete _memSrcActs[deviceID]; + _memSrcActs[deviceID] = &mem; + if (_actsTarget >= 0 && deviceID == getDeviceID()) { + assert(getNumInputReplicas() == 1); + _prev[0][_actsTarget]->setMemorySourceActs(deviceID, mem.clone(_prev[0][_actsTarget]->getName())); + } +} + +void Layer::setMemorySourceActsGrad(int deviceID, MemoryView& mem) { + assert(_memSrcActsGrad[deviceID]->isParent()); + delete _memSrcActsGrad[deviceID]; + _memSrcActsGrad[deviceID] = &mem; + if (_actsGradTarget >= 0 && deviceID == getDeviceID()) { + assert(getNumInputReplicas() == 1); + _prev[0][_actsGradTarget]->setMemorySourceActsGrad(deviceID, mem.clone(_prev[0][_actsGradTarget]->getName())); + } +} + +MemoryView& Layer::getMemorySourceActs(int deviceID) { + return *_memSrcActs[deviceID]; +} + +MemoryView& Layer::getMemorySourceActsGrad(int deviceID) { + return *_memSrcActsGrad[deviceID]; +} + +int Layer::getNumOutputs() { + return _numOutputs; +} + +void Layer::setInputIdx(std::string& parentName, int idx) { + _inputIndices[parentName] = idx; +} + +int Layer::getInputIdx(std::string& parentName) { + return _inputIndices[parentName]; +} + +/* + * ======================= + * NeuronLayer + * ======================= + */ +NeuronLayer::NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, true) { + PyObject* neuronDict = PyDict_GetItemString(paramsDict, "neuron"); + _neuronType = pyDictGetString(neuronDict, "type"); + _neuron = &Neuron::makeNeuron(neuronDict); +} + +NeuronLayer::~NeuronLayer() { + delete _neuron; +} + +void NeuronLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + if (!bpropSpecial(v, replicaIdx, inpIdx, scaleTargets, passType)) { + _neuron->computeInputGrad(v, _prev[replicaIdx][0]->getActsGrad(), scaleTargets > 0); + } +} + +bool NeuronLayer::bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + // Special optimization for cross-entropy objective with logistic units. + // Better to just compute the input gradient in one go to avoid division by small numbers. 
+ bool doCrossEntGrad = _neuronType == "logistic" && _next.size() == 1 + && (_next[0]->getType() == "cost.bce" || _next[0]->getType() == "cost.dce") + && _next[0]->getDeviceID() == getDeviceID() + && _next[0]->getNumReplicas() == getNumReplicas(); + LayerV& prev = _prev[replicaIdx]; + if (doCrossEntGrad) { + NVMatrix& labels = _next[0]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); + BinomialCrossEntropyCostLayer& cost = *static_cast(_next[0]); + float gradCoeff = cost.getCoeff(); + labels.transpose(_trans); + if (cost.getPosWeight() == 1) { + if (scaleTargets == 0) { + getActs().add(labels, -gradCoeff, gradCoeff, prev[0]->getActsGrad()); + } else { + getActs().applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::WeightedAdd(-gradCoeff, gradCoeff)), + labels, prev[0]->getActsGrad(), prev[0]->getActsGrad()); + } + } else { + if (scaleTargets == 0) { + getActs().applyBinary(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight()), labels, prev[0]->getActsGrad()); + } else { + getActs().applyTernary(AddGradientBinaryOperator(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight())), + labels, prev[0]->getActsGrad(), prev[0]->getActsGrad()); + } + } + } + return doCrossEntGrad; +} + +void NeuronLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + _neuron->activate(*_inputs[0], getActs()); +} + +std::string& NeuronLayer::getNeuronType() { + return _neuronType; +} + +/* + * ======================= + * WeightLayer + * ======================= + * + * The useGrad parameter here merely expresses a preference by the subclass. It may + * be overridden by the superclass (WeightLayer) and in that case the subclass must follow its wishes. + * So when computing gradient updates, the subclass must always first check weights.isUseGrad(). + * + * Note: biases always useGrad. + */ +WeightLayer::WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad) : + Layer(convNetThread, paramsDict, replicaID, trans) { + _weightUpdatePassPeriod = pyDictGetInt(paramsDict, "updatePeriod"); + + MatrixV& hWeights = *pyDictGetMatrixV(paramsDict, "weights"); + MatrixV& hWeightsInc = *pyDictGetMatrixV(paramsDict, "weightsInc"); + Matrix& hBiases = *pyDictGetMatrix(paramsDict, "biases"); + Matrix& hBiasesInc = *pyDictGetMatrix(paramsDict, "biasesInc"); + PyObject* pyEpsWList = PyDict_GetItemString(paramsDict, "epsW"); + PyObject* pyEpsB = PyDict_GetItemString(paramsDict, "epsB"); + floatv& momW = *pyDictGetFloatV(paramsDict, "momW"); + float momB = pyDictGetFloat(paramsDict, "momB"); + floatv& wc = *pyDictGetFloatV(paramsDict, "wc"); + floatv& wball = *pyDictGetFloatV(paramsDict, "wballNormed"); + + /* + * When there are multiple replicas, the present implementation + * requires that useGrad is true. This is because weights.update() + * performs a simultaneous write to both replicas' weightsInc matrix, + * which means that the read should come from somewhere else (i.e. a + * grads matrix). 
+ */ + useGrad |= _numReplicas > 1; + + // Source layers for shared weights + stringv& weightSourceLayers = *pyDictGetStringV(paramsDict, "weightSourceLayers"); + + // Weight matrix indices (inside the above source layers) for shared weights + intv& weightSourceMatrixIndices = *pyDictGetIntV(paramsDict, "weightSourceMatrixIndices"); + _weights = new WeightList(); + for (int i = 0; i < weightSourceLayers.size(); i++) { + std::string& srcLayerName = weightSourceLayers[i]; + int matrixIdx = weightSourceMatrixIndices[i]; + PyObject* pyEpsW = PyList_GetItem(pyEpsWList, i); + ParameterSchedule& lrs = ParameterSchedule::make(pyEpsW); // Learning rate schedule + if (srcLayerName == _name) { // Current layer + _weights->addWeights(*new Weights(_weights->at(matrixIdx), lrs, *this)); + } else if (srcLayerName != "") { + WeightLayer& srcLayer = *static_cast(&convNetThread->getLayer(srcLayerName)); + Weights* srcWeights = &srcLayer.getWeights(matrixIdx); + _weights->addWeights(*new Weights(*srcWeights, lrs, *this)); + } else { + _weights->addWeights(*new Weights(*hWeights[i], *hWeightsInc[i], lrs, *this, wc[i], wball[i], momW[i], useGrad)); + } + } + _biases = new Weights(hBiases, hBiasesInc, ParameterSchedule::make(pyEpsB), *this, 0, 0, momB, true); + + delete &weightSourceLayers; + delete &weightSourceMatrixIndices; + delete &hWeights; + delete &hWeightsInc; + delete &momW; + delete &wc; + delete &wball; + + _wStep = 0.02; + _bStep = 0.005; +} + +WeightLayer::~WeightLayer() { + delete _weights; + delete _biases; +} + +bool WeightLayer::postInit() { + if (Layer::postInit()) { + _weightUpdatePassPeriod = max(_weightUpdatePassPeriod, getActivePassPeriod()); + assert(_weightUpdatePassPeriod % getActivePassPeriod() == 0); + return true; + } + return false; +} + +void WeightLayer::fpropCommon(PASS_TYPE passType) { +} + +void WeightLayer::bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) { + if (_biases->getLearningRateSchedule().getBaseValue() > 0) { + if (v.getNumElements() > 0) { + bpropBiases(v, passType); + } else { + _biases->getGrad().resize(_biases->getW()); + _biases->getGrad().scale(getBIncScale()); + } + _biases->incNumUpdates(); + } + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + if (v.getNumElements() > 0) { + bpropWeights(v, replicaIdx, i, passType); + } else { + _weights->at(i).getGrad().resize(_weights->at(i).getW()); + // This will cause it to forget momentum when shown 0 training cases + // and _useGrad = false but it's not too important. 
+ _weights->at(i).getGrad().scale(getIncScale(i, passType)); + } + // Increment its number of updates + _weights->at(i).incNumUpdates(); + } + } +} + +bool WeightLayer::updateWeights() { + if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) { + _weights->update(getConvNet().getTrainingProgress()); + _biases->update(getConvNet().getTrainingProgress()); +// constrainWeights(); + return true; + } + return false; +} + +bool WeightLayer::constrainWeights() { + if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) { + _constrainWeights(); + return true; + } + return false; +} + +void WeightLayer::_constrainWeights() { +} + +void WeightLayer::copyToCPU() { + _weights->copyToCPU(); + _biases->copyToCPU(); +} + +void WeightLayer::copyToGPU() { + _weights->copyToGPU(); + _biases->copyToGPU(); +} + +void WeightLayer::checkGradient() { + for (int i = 0; i < _weights->getSize(); i++) { + getConvNet().checkGradient(_name + " weights[" + tostr(i) + "]", _wStep, _weights->at(i)); + } + getConvNet().checkGradient(_name + " biases", _bStep, *_biases); +} + +void WeightLayer::addReplica(Layer& l) { + Layer::addReplica(l); + _weights->addReplica(*static_cast(&l)->_weights); + _biases->addReplica(*static_cast(&l)->_biases); +} + +Weights& WeightLayer::getWeights(int idx) { + return _weights->at(idx); +} + +float WeightLayer::getGradScale(int inpIdx, PASS_TYPE passType) { + // weight update period must be multiple of activation period + // TODO: simply accumulate # of cases seen between weight updates. simpler and more accurate. + double numCases = _weightUpdatePassPeriod * (getConvNet().getMinibatchSize() / double(getConvNet().getNumPasses())); + if (_weights->at(inpIdx).isUseGrad()) { + return passType == PASS_GC ? 1.0f : 1.0f / numCases; + } + return passType == PASS_GC ? 1.0f : _weights->at(inpIdx).getEps(getConvNet().getTrainingProgress()) / numCases; +} + +float WeightLayer::getIncScale(int inpIdx, PASS_TYPE passType) { + if (_weights->at(inpIdx).isUseGrad()) { + return _weights->at(inpIdx).getNumUpdates() > 0; + } + return (passType == PASS_GC ? _weights->at(inpIdx).getNumUpdates() > 0 + : (_weights->at(inpIdx).getNumUpdates() == 0 ? _weights->at(inpIdx).getMom() : 1.0f)); +} + +NVMatrix& WeightLayer::getGradTarget(int inpIdx) { + return _weights->at(inpIdx).getGrad(); +} + +float WeightLayer::getBGradScale(PASS_TYPE passType) { + int numCases = _weightUpdatePassPeriod * DIVUP(getConvNet().getMinibatchSize(), getConvNet().getNumPasses()); + return passType == PASS_GC ? 
1.0f : 1.0f / numCases; +} + +float WeightLayer::getBIncScale() { + return _biases->getNumUpdates() > 0; +} + +NVMatrix& WeightLayer::getWeightMatrix(PASS_TYPE passType, int inpIdx) { + return _weights->at(inpIdx).getW(); +} + +NVMatrix& WeightLayer::getBiasMatrix(PASS_TYPE passType) { + return _biases->getW(); +} + +/* + * ======================= + * FCLayer + * ======================= + */ +FCLayer::FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : WeightLayer(convNetThread, paramsDict, replicaID, true, useGrad) { + _wStep = 0.01; + _bStep = 0.01; +} + +void FCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().addProduct(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType), 1, getActs()); + } +} + +void FCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose(); + _prev[replicaIdx][inpIdx]->getActsGrad().addProduct(v, weights_T, scaleTargets, 1); + delete &weights_T; +} + +void FCLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + _biases->getGrad().addSum(v, 0, getBIncScale(), getBGradScale(passType)); +} + +void FCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose(); + float scaleGrad = getGradScale(inpIdx, passType); + float scaleInc = getIncScale(inpIdx, passType); + getGradTarget(inpIdx).addProduct(prevActs_T, v, scaleInc, scaleGrad); + delete &prevActs_T; +} + +void FCLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { +// NVMatrix norm2; // Unfortunate extra weight matrix... 
+ _weights->at(i).getW().sumOfSquares(0, _norm2); +// norm2.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall())); + _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall())); + _weights->at(i).getW().eltwiseMultByVector(_norm2); + } + } +} + +/* + * ======================= + * SplitFCLayer + * ======================= + */ +SplitFCLayer::SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : FCLayer(convNetThread, paramsDict, replicaID, useGrad) { + _numParts = pyDictGetInt(paramsDict, "parts"); +} + +void SplitFCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().resize(_inputs[inpIdx]->getNumRows(), _numOutputs, true); + NVMatrixV& splitInput = _inputs[inpIdx]->splitCols(_numParts); + NVMatrixV& splitWeights = getWeightMatrix(passType, inpIdx).splitRows(_numParts); + NVMatrixV& splitTarget = getActs().splitCols(_numParts); + + NVMatrix::batchedMatrixMultiply(splitInput, splitWeights, splitTarget, scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType), 1, getActs()); + } + + deleteElements(splitInput, true); + deleteElements(splitWeights, true); + deleteElements(splitTarget, true); +} + +void SplitFCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose(); + _prev[replicaIdx][inpIdx]->getActsGrad().resize(*_inputs[inpIdx]); + + NVMatrixV& splitV = v.splitCols(_numParts); + NVMatrixV& splitWeights_T = weights_T.splitCols(_numParts); + NVMatrixV& splitTarget = _prev[replicaIdx][inpIdx]->getActsGrad().splitCols(_numParts); + + NVMatrix::batchedMatrixMultiply(splitV, splitWeights_T, splitTarget, scaleTargets, 1); + + delete &weights_T; + deleteElements(splitV, true); + deleteElements(splitWeights_T, true); + deleteElements(splitTarget, true); +} + +void SplitFCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose(); + NVMatrixV& splitPrevActs_T = prevActs_T.splitRows(_numParts); + NVMatrixV& splitV = v.splitCols(_numParts); + NVMatrixV& splitGradTarget = getGradTarget(inpIdx).splitRows(_numParts); + + NVMatrix::batchedMatrixMultiply(splitPrevActs_T, splitV, splitGradTarget, getIncScale(inpIdx, passType), getGradScale(inpIdx, passType)); + + delete &prevActs_T; + deleteElements(splitPrevActs_T, true); + deleteElements(splitV, true); + deleteElements(splitGradTarget, true); +} + +/* + * ======================= + * TwoDLayerInterface + * ======================= + */ +TwoDLayerInterface::TwoDLayerInterface(PyObject* paramsDict) { + _channels = pyDictGetInt(paramsDict, "channels"); + _imgSize = pyDictGetInt(paramsDict, "imgSize"); + _imgPixels = _imgSize * _imgSize; +} + +/* + * ======================= + * LocalLayer + * ======================= + */ +LocalLayer::LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad) + : WeightLayer(convNetThread, paramsDict, replicaID, false, useGrad) { + _padding = pyDictGetIntV(paramsDict, "padding"); + _stride = pyDictGetIntV(paramsDict, "stride"); + _filterSize = pyDictGetIntV(paramsDict, "filterSize"); + _channels = pyDictGetIntV(paramsDict, "channels"); + _imgSize = pyDictGetIntV(paramsDict, "imgSize"); + _numFilters = pyDictGetInt(paramsDict, "filters"); + _groups = pyDictGetIntV(paramsDict, "groups"); + _filterChannels = pyDictGetIntV(paramsDict, "filterChannels"); + _filterPixels 
= pyDictGetIntV(paramsDict, "filterPixels"); + _imgPixels = pyDictGetIntV(paramsDict, "imgPixels"); + + _modulesX = pyDictGetInt(paramsDict, "modulesX"); + _modules = pyDictGetInt(paramsDict, "modules"); +} + +LocalLayer::~LocalLayer() { + delete _padding; + delete _stride; + delete _filterSize; + delete _channels; + delete _imgSize; + delete _groups; + delete _filterChannels; + delete _filterPixels; + delete _imgPixels; +} + +/* + * ======================= + * ConvLayer + * ======================= + */ +ConvLayer::ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : LocalLayer(convNetThread, paramsDict, replicaID, true) { + _sumWidth = pyDictGetInt(paramsDict, "sumWidth"); + _sharedBiases = pyDictGetInt(paramsDict, "sharedBiases"); + _weightContrastNormMin = pyDictGetFloatV(paramsDict, "wcNormMin"); + _weightContrastNormMax = pyDictGetFloatV(paramsDict, "wcNormMax"); +} + +ConvLayer::~ConvLayer() { + delete _weightContrastNormMin; + delete _weightContrastNormMax; +} + +void ConvLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); + + if (scaleTargets == 0) { + if (_sharedBiases) { + getActs().reshape(_numFilters, getActs().getNumElements() / _numFilters); + getActs().addVector(getBiasMatrix(passType)); + getActs().reshape(_numFilters * _modules, getActs().getNumElements() / (_numFilters * _modules)); + } else { + getActs().addVector(getBiasMatrix(passType)); + } + } +} + +void ConvLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + float scaleBGrad = getBGradScale(passType); + float scaleInc = getBIncScale(); + if (_sharedBiases) { + v.reshape(_numFilters, v.getNumElements() / _numFilters); + _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); + v.reshape(_numFilters * _modules, v.getNumElements() / (_numFilters * _modules)); + } else { + _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); + } +} + +void ConvLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + assert(_weights->at(inpIdx).isUseGrad()); + bool doPartialSum = _sumWidth < _modulesX; + NVMatrix& tgt = doPartialSum ? 
_weightGradTmp : _weights->at(inpIdx).getGrad(); + + float scaleWGrad = getGradScale(inpIdx, passType); + float scaleTargets = getIncScale(inpIdx, passType) * !doPartialSum; + + convWeightActs(*_inputs[inpIdx], v, tgt, _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), _sumWidth, scaleTargets, scaleWGrad); + + if (doPartialSum) { + scaleTargets = _weights->at(inpIdx).getNumUpdates() > 0; + int outWidth = DIVUP(_modulesX, _sumWidth); + _weightGradTmp.reshape(outWidth*outWidth, _filterChannels->at(inpIdx) * _filterPixels->at(inpIdx) * _numFilters); + _weights->at(inpIdx).getGrad().addSum(_weightGradTmp, 0, scaleTargets, 1); + _weights->at(inpIdx).getGrad().reshape(_filterChannels->at(inpIdx) * _filterPixels->at(inpIdx), _numFilters); + } +} + +void ConvLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(), _imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX, + _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); +} + +void ConvLayer::truncBwdActs() { + LocalLayer::truncBwdActs(); + _weightGradTmp.truncate(); +} + +void ConvLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weightContrastNormMax->at(i) > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + float fz = _weights->at(i).getW().getNumRows(); + NVMatrix tmp; + _weights->at(i).getW().sum(0, tmp); + _weights->at(i).getW().addVector(tmp, -1.0f / fz, _weights->at(i).getGrad()); + // Now _weights->at(i).getGrad() contains zero-mean filters + _weights->at(i).getGrad().apply(NVMatrixOps::Square()); + _weights->at(i).getGrad().sum(0, tmp); + + tmp.apply(WeightContrastNormOperator(_weightContrastNormMin->at(i), _weightContrastNormMax->at(i), 1.0f / fz)); + // Now tmp has the stdev + _weights->at(i).getW().eltwiseMultByVector(tmp); + } + // It's pretty silly to do both these things but whatever + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { +// NVMatrix norm2; + _weights->at(i).getW().sumOfSquares(0, _norm2); + +// norm.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall())); + _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall())); + _weights->at(i).getW().eltwiseMultByVector(_norm2); + } + } +} + +/* + * ======================= + * LocalUnsharedLayer + * ======================= + */ +LocalUnsharedLayer::LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : LocalLayer(convNetThread, paramsDict, replicaID, false) { +} + +void LocalUnsharedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + localFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); + if (scaleTargets == 0) { + getActs().addVector(getBiasMatrix(passType)); + } +} + +void LocalUnsharedLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { + _biases->getGrad().addSum(v, 1, getBIncScale(), getBGradScale(passType)); +} + +void LocalUnsharedLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { + float scaleWGrad = getGradScale(inpIdx, 
passType); + float scaleInc = getIncScale(inpIdx, passType); + localWeightActs(*_inputs[inpIdx], v, getGradTarget(inpIdx), _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx), + _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleInc, scaleWGrad); +} + +void LocalUnsharedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + localImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(),_imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX, + _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); +} + +void LocalUnsharedLayer::_constrainWeights() { + for (int i = 0; i < _weights->getSize(); i++) { + if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) { + normalizeLocalWeights(*_weights->at(i), _modules, _weights->at(i).getWBall()); + } + } +} + +/* + * ======================= + * SoftmaxLayer + * ======================= + */ +SoftmaxLayer::SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, true), _doUpperGrad(false) { +} + +void SoftmaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + NVMatrix& input = *_inputs[0]; + input.max(1, _max); + input.addVector(_max, -1, getActs()); + getActs().apply(NVMatrixOps::Exp()); + getActs().sum(1, _sum); + getActs().eltwiseDivideByVector(_sum); +} + +void SoftmaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + LayerV& prev = _prev[replicaIdx]; + if (_doUpperGrad) { + // Todo: rethink replica IDs or idxes... 
this here doesn't make a huge amount of sense + for (int i = 0; i < _next.size(); ++i) { + if (_next[i]->isGradProducer(getName())) { + NVMatrix& labels = _next[i]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); // Get cost's labels + float gradCoeff = dynamic_cast(_next[i])->getCoeff(); + + computeLogregSoftmaxGrad(labels, getActs(), prev[0]->getActsGrad(), scaleTargets == 1, gradCoeff); + break; + } + } + + } else { + computeSoftmaxGrad(getActs(), v, prev[0]->getActsGrad(), scaleTargets, 1); + } +} + +void SoftmaxLayer::setDoUpperGrad(bool b) { + _doUpperGrad = b; +} + +/* + * ======================= + * ConcatenationLayer + * ======================= + */ +ConcatenationLayer::ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, false) { + _copyOffsets = pyDictGetIntV(paramsDict, "copyOffsets"); + _copyOffsets->push_back(_numOutputs); +} + +ConcatenationLayer::~ConcatenationLayer() { + delete _copyOffsets; +} + +void ConcatenationLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().resize(_numOutputs, _inputs[inpIdx]->getNumCols()); + _inputs[inpIdx]->copy(getActs(), 0, -1, 0, -1, _copyOffsets->at(inpIdx), 0); +} + +void ConcatenationLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& copySrc = v.sliceRows(_copyOffsets->at(inpIdx), _copyOffsets->at(inpIdx + 1)); // view + _prev[replicaIdx][inpIdx]->getActsGrad().add(copySrc, scaleTargets, 1); + delete ©Src; +} + +/* + * ======================= + * PassThroughLayer + * ======================= + */ +PassThroughLayer::PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void PassThroughLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // No-op +} + +void PassThroughLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + // No-op +} + +bool PassThroughLayer::postInit() { + if (Layer::postInit()) { + assert(getNumInputReplicas() == 1); + for (int i = 0, offset = 0; i < _prev[0].size(); offset += _prev[0][i]->getNumOutputs(), i++) { + MemoryView& vActs = _memSrcActs[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair(offset, offset + _prev[0][i]->getNumOutputs())); + MemoryView& vActsGrad = _memSrcActsGrad[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair(offset, offset + _prev[0][i]->getNumOutputs())); + _prev[0][i]->setMemorySourceActs(getDeviceID(), vActs); + _prev[0][i]->setMemorySourceActsGrad(getDeviceID(), vActsGrad); + } + return true; + } + return false; +} + + +/* + * ======================= + * EltwiseSumLayer + * ======================= + */ +EltwiseSumLayer::EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _coeffs = pyDictGetFloatV(paramsDict, "coeffs"); +} + +EltwiseSumLayer::~EltwiseSumLayer() { + delete _coeffs; +} + +void EltwiseSumLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + getActs().add(*_inputs[inpIdx], scaleTargets, _coeffs->at(inpIdx)); +} + +void EltwiseSumLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + _prev[replicaIdx][inpIdx]->getActsGrad().add(v, scaleTargets, _coeffs->at(inpIdx)); +} + +/* + * ======================= + * EltwiseMaxLayer + 
* ======================= + */ +EltwiseMaxLayer::EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void EltwiseMaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (inpIdx == 1) { // First input, do nothing + _inputs[inpIdx]->applyBinary(NVMatrixAggs::Max(), *_inputs[0], getActs()); + } else if (inpIdx > 1) { + getActs().applyBinary(NVMatrixAggs::Max(), *_inputs[inpIdx]); + } +} + +void EltwiseMaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + computeEltwiseMaxGrad(v, *_inputs[inpIdx], getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), scaleTargets != 0); +} + + +/* + * ======================= + * DropoutLayer + * ======================= + * + * TODO: optimize away the case when using dopout over relus. Don't need the keepmask. + */ +DropoutLayer::DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _enable = pyDictGetInt(paramsDict, "enable"); + _keep = pyDictGetFloat(paramsDict, "keep"); +} + +void DropoutLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_enable && passType == PASS_TRAIN) { + _keepMask.resize(*_inputs[inpIdx]); + _keepMask.randomizeUniform(); + _keepMask.apply(DropoutSmallerThanOperator(_keep)); + _inputs[inpIdx]->eltwiseMult(_keepMask, getActs()); + } else { + _inputs[inpIdx]->copy(getActs()); + } +} + +void DropoutLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + LayerV& prev = _prev[replicaIdx]; + if (_enable && passType == PASS_TRAIN) { + if (scaleTargets != 0) { + v.applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::Multiply()), + _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad()); + } + } else { + prev[inpIdx]->getActsGrad().add(v, scaleTargets, 1); + } +} + +void DropoutLayer::truncBwdActs() { + Layer::truncBwdActs(); + _keepMask.truncate(); +} + + +/* + * ======================= + * Dropout2Layer + * ======================= + * + * TODO: optimize away the case when using dopout over relus. Don't need the keepmask. 
+ */ +Dropout2Layer::Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : DropoutLayer(convNetThread, paramsDict, replicaID) { +} + +void Dropout2Layer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_enable && passType == PASS_TRAIN) { + _keepMask.resize(*_inputs[inpIdx]); + _keepMask.randomizeUniform(); + _keepMask.smallerThanScalar(_keep); + _inputs[inpIdx]->eltwiseMult(_keepMask, getActs()); + } else { + _inputs[inpIdx]->scale(_keep, getActs()); + } +} + +void Dropout2Layer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + LayerV& prev = _prev[replicaIdx]; + if (_enable && passType == PASS_TRAIN) { + if (scaleTargets != 0) { + v.applyTernary(AddGradientBinaryOperator(NVMatrixBinaryOps::Multiply()), + _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad()); + } + } else { + if (scaleTargets != 0) { + v.applyBinary(AddGradientOperator(NVMatrixOps::MultByScalar(_keep)), + prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad()); + } else { + v.scale(_keep, prev[inpIdx]->getActsGrad()); + } + } +} + +/* + * ======================= + * DataLayer + * ======================= + */ +DataLayer::DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID) : Layer(NULL, paramsDict, replicaID, false) { + _dataIdx = pyDictGetInt(paramsDict, "dataIdx"); + _start = pyDictGetInt(paramsDict, "start"); + _end = pyDictGetInt(paramsDict, "end"); + _useBuffer = false; + _outstandingCopyRequest = false; + _convNet = convNet; +} + +DataLayer::~DataLayer() { + for (map::const_iterator it = _copyStreams.begin(); it != _copyStreams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(it->second)); + } + for (std::map::iterator it = _memSrcActs2.begin(); it != _memSrcActs2.end(); ++it) { + if (it->second->getMemorySource().truncate(_name)) { + delete &it->second->getMemorySource(); + } + } + _copier->stop(); + delete _copier; +} + +void DataLayer::fprop(PASS_TYPE passType, int passIdx, bool fromBuffer) { + waitForCopyFinish(); + if (fromBuffer && getFwdActiveInputReplicaIdx(passIdx) >= 0) { + _useBuffer = !_useBuffer; + } + + for (int i = 0; i < _next.size(); i++) { + _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx)); + } +} + +void DataLayer::waitForCopyFinish() { + if (_outstandingCopyRequest) { + _copyFinishQueue.dequeue(); + assert(_copyFinishQueue.getNumElements() == 0); + _outstandingCopyRequest = false; + } +} + +cudaStream_t DataLayer::getCopyStream(int deviceID) { + if (_copyStreams.count(deviceID) == 0) { + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_copyStreams[deviceID], cudaStreamNonBlocking)); + } + return _copyStreams[deviceID]; +} + +void DataLayer::copyData(CPUData& data, bool other, int passIdx) { + assert(!_outstandingCopyRequest); + assert(_copyFinishQueue.getNumElements() == 0); + _copier->getQueue().enqueue(new DataCopyMessage(data, other, passIdx)); + _outstandingCopyRequest = true; +} + +int DataLayer::getNumInputReplicas() { + return _convNet->getNumReplicasMax() / getNumReplicas(); +} + +void DataLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + +} + +NVMatrix& DataLayer::getActs(int deviceID) { + return getActs(deviceID, false, -1); +} + +NVMatrix& DataLayer::getActs(int deviceID, bool other, int numCases) { +// printf("%s[%d] getActs(%d, %d, %d)\n", _name.c_str(), 
getReplicaID(), deviceID, other, numCases); + assert(_memSrcActs.count(deviceID) > 0); + assert(_memSrcActs2.count(deviceID) > 0); + return (_useBuffer != other ? _memSrcActs2[deviceID]->getMemory(numCases) : _memSrcActs[deviceID]->getMemory(numCases)); +} + +ConvNet& DataLayer::getConvNet() { + return *_convNet; +} + +bool DataLayer::postInit() { + if (Layer::postInit()) { + for (int i = 0; i < _next.size(); ++i) { + int d = _next[i]->getDeviceID(); + if (_memSrcActs2.count(d) == 0) { + _memSrcActs2[d] = &MemorySource::make(_numOutputs, d, getName()); + } + } + intv cpus = getDeviceCPUs(_next[0]->getDeviceID()); + _copier = new DataCopyThread(*this, cpus); + _copier->start(); + return true; + } + return false; +} + +bool DataLayer::isGradProducer() { + return false; +} + +/* + * ======================= + * DataCopyThread + * ======================= + */ +DataCopyThread::DataCopyThread(DataLayer& parent, intv& cpus) : _parent(&parent), _sleepUsec(0), Thread(true, cpus) { +} + +Queue& DataCopyThread::getQueue() { + return _queue; +} + +void DataCopyThread::stop() { + getQueue().enqueue(new DataCopyExitMessage()); + join(); +} + +void* DataCopyThread::run() { + NVMatrix::setDeviceID(*_parent->getNextDeviceIDs().begin()); + bool exit = false; + while(!exit) { + DataCopyMessage& msg = *_queue.dequeue(); + exit = msg.getType() == DataCopyMessage::EXIT; + if (!exit) { + CPUData& data = msg.getData(); + int passIdx = msg.getPassIdx(); + bool other = msg.isOther(); + + Matrix& dataMatrix = data.getData(_parent->getDataIdx()); + // How many times is this layer going to process microbatches from this minibatch? + assert(_parent->getNumReplicasNext() == _parent->getNumReplicas()); + int microIdx = _parent->getFwdActiveInputReplicaIdx(passIdx); + + if (microIdx >= 0) { + if (_requestTimer.isStarted()) { + double requestIntervalMsec = _requestTimer.stop(); + // Sleep for up to 1/20th the average request interval + _sleepUsec = int(round(0.95 * _sleepUsec + 0.05 * (_parent->getReplicaID() / double(_parent->getNumReplicas())) * requestIntervalMsec * 1000.0 / 20.0)); + } + _requestTimer.start(); + if (other) { + // Sleeping a bit is helpful because in typical nets, copying input data + // as soon as it's available will produce contention with other communications + // that are happening at the time. This is very much a hack, so in the future + // it might be good to replace it with something smarter which schedules access + // to communication links. + usleep(_sleepUsec); + } + microIdx += _parent->getReplicaID() * _parent->getNumInputReplicas(); + // Safer to divup because this way you won't get a minibatch size of 0 + int microbatchSize = DIVUP(data.getNumCases(), _parent->getConvNet().getNumReplicasMax()); + int microStart = microIdx * microbatchSize; + int microEnd = min(data.getNumCases(), (microIdx + 1) * microbatchSize); + // Check that this replica has some data. This can be false when, for example, + // there are only 7 examples in the minibatch but 8 replicas. + if (microStart < microEnd) { + assert(dataMatrix.isView() == dataMatrix.isTrans()); + int pipe = _parent->getConvNet().getDataCopyPD().getPipe(_parent->getReplicaID()/2); + if (dataMatrix.isTrans()) { + Matrix& replicaDataMatrix = dataMatrix.sliceCols(microStart, microEnd); + // In this case, dataMatrix is a view on memory allocated by Python. 
+ //_hostMemFwd.copyFromHost(replicaDataMatrix, true); + _hostMemFwd.resize(replicaDataMatrix.getNumRows(), replicaDataMatrix.getNumCols(), true); + memcpy(_hostMemFwd.getDevData(), replicaDataMatrix.getData(), replicaDataMatrix.getNumDataBytes()); + delete &replicaDataMatrix; // view + NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd()); + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + // Copy my output to this guy's GPU + NVMatrix::setDeviceID(deviceID); + // Note to self: this is the path that gets executed in practice + // in my models. It does a transpose & copy simultaneously. + hostMemFwdSlice.flipTrans(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID)); + } + delete &hostMemFwdSlice; + } else { + // Hacky way to copy a slice to _hostMemFwd + _hostMemFwd.resize(dataMatrix.getNumRows(), microEnd - microStart); + Matrix tmp(_hostMemFwd.getDevData(), _hostMemFwd.getNumRows(), _hostMemFwd.getNumCols(), _hostMemFwd.isTrans()); + dataMatrix.sliceCols(microStart, microEnd, tmp); + NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd()); + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + // Copy my output to this guy's GPU + NVMatrix::setDeviceID(deviceID); + hostMemFwdSlice.copy(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID)); + } + delete &hostMemFwdSlice; + } + + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + NVMatrix::setDeviceID(deviceID); + NVMatrix::syncStream(_parent->getCopyStream(deviceID)); + } + _parent->getConvNet().getDataCopyPD().freePipe(pipe); + } else { + for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) { + int deviceID = *it; + _parent->getActs(deviceID, other, 0); + } + } + } + _parent->getCopyFinishQueue().enqueue(1); + } + delete &msg; + } + return NULL; +} + +/* + * ===================== + * PoolLayer + * ===================== + */ +PoolLayer::PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) { + _sizeX = pyDictGetInt(paramsDict, "sizeX"); + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputsX = pyDictGetInt(paramsDict, "outputsX"); + _pool = pyDictGetString(paramsDict, "pool"); +} + +PoolLayer& PoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) { + std::string _pool = pyDictGetString(paramsDict, "pool"); + if (_pool == "max") { + return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, false); + } else if(_pool == "maxabs") { + return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, true); + } else if(_pool == "avg") { + return *new AvgPoolLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown pooling layer type ") + _pool; +} + +/* + * ===================== + * AvgPoolLayer + * ===================== + */ +AvgPoolLayer::AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : PoolLayer(convNetThread, paramsDict, replicaID, false) { + _sum = pyDictGetInt(paramsDict, "sum"); +} + +void AvgPoolLayer::fpropActs(int inpIdx, float 
scaleTargets, PASS_TYPE passType, int passIdx) { + if (_sum) { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); + } else { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); + } +} + +void AvgPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convLocalAvgUndo(v, _prev[replicaIdx][0]->getActsGrad(), _sizeX, _start, _stride, _outputsX, _imgSize, _sum, scaleTargets, 1); +} + +/* + * ===================== + * MaxPoolLayer + * ===================== + */ +MaxPoolLayer::MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs) : PoolLayer(convNetThread, paramsDict, replicaID, false), _abs(abs) { +} + +void MaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (_abs) { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxAbsPooler()); + } else { + convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxPooler()); + } +} + +void MaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + convLocalMaxUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _sizeX, _start, _stride, _outputsX, scaleTargets, 1); +} + +/* + * ===================== + * CrossMapPoolLayer + * ===================== + */ +CrossMapPoolLayer::CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) { + _size = pyDictGetInt(paramsDict, "size"); + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputs = pyDictGetInt(paramsDict, "outputChannels"); + _pool = pyDictGetString(paramsDict, "pool"); +} + +CrossMapPoolLayer& CrossMapPoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) { + std::string _pool = pyDictGetString(paramsDict, "pool"); + if (_pool == "max") { + return *new CrossMapMaxPoolLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown pooling layer type ") + _pool; +} + +/* + * ===================== + * CrossMapMaxPoolLayer + * ===================== + */ +CrossMapMaxPoolLayer::CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CrossMapPoolLayer(convNetThread, paramsDict, replicaID, false) { +} + +void CrossMapMaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convPoolCrossMap(*_inputs[0], getActs(), _start, _size, _outputs, _stride, _imgSize, MaxPooler()); +} + +void CrossMapMaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 0); + convCrossMapMaxPoolUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][0]->getActsGrad(), _imgSize, _start, _size, _stride, scaleTargets, 1); +} + +/* + * ===================== + * RandomScaleLayer + * ===================== + */ +RandomScaleLayer::RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _maxScale = pyDictGetFloat(paramsDict, "maxScale"); + _tgtSize = pyDictGetInt(paramsDict, "tgtSize"); + // The smallest size the image could be after rescaling + _minScaledSize = _imgSize / _maxScale; + + // The number of 
discrete scales we're considering + int numScales = _imgSize - _minScaledSize + 1; + + // The total number of squares of size _tgtSize that we can extract + // from all these scales + double numCrops = numScales * (numScales + 1) * (2 * numScales + 1) / 6; + + // For each scale, record the fraction of the squares that it has. + // This will be the probability of sampling this scale. + _scaleProbs.push_back(1.0 / numCrops); + for (int s = 1; s < numScales; ++s) { + _scaleProbs.push_back(_scaleProbs[s-1] + (s + 1) * (s + 1) / numCrops); + } +} + +void RandomScaleLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + if (IS_TRAIN(passType)) { + // _maxScale is in the range [1, 2) + float r = randf; + int rescaledSize = _tgtSize; + float scaleFactor = _maxScale; + // Find which scale we have sampled + for (int s = 0; s < _scaleProbs.size(); ++s) { + if (r <= _scaleProbs[s]) { + rescaledSize += s; + float scaleFactorEnd = _imgSize / float(rescaledSize); + float scaleFactorStart = max(1.0, _imgSize / (1.0 + rescaledSize)); + scaleFactor = scaleFactorStart + randf * (scaleFactorEnd - scaleFactorStart); + break; + } + } + assert(rescaledSize >= _tgtSize); + int maxStart = rescaledSize - _tgtSize; + int startY = rand() % (1 + maxStart), startX = rand() % (1 + maxStart); + if (rescaledSize == _imgSize) { + convCrop(*_inputs[0], getActs(), rescaledSize, _tgtSize, startY, startX); + } else { + convResizeBilinear(*_inputs[0], _rescaledActs, _imgSize, rescaledSize, scaleFactor); + convCrop(_rescaledActs, getActs(), rescaledSize, _tgtSize, startY, startX); + } + _rescaledActs.truncate(); // this'll have a different size each time so may as well truncate it. + } else if (IS_MULTIVIEW_TEST(passType)) { // for now... + _inputs[0]->copy(getActs()); + } else if (IS_TEST(passType)) { // Test on center patch + convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _maxScale); + } +} + +void RandomScaleLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * CropLayer + * ===================== + */ +CropLayer::CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _startX = pyDictGetInt(paramsDict, "startX"); + _startY = pyDictGetInt(paramsDict, "startY"); + _tgtSize = pyDictGetInt(paramsDict, "sizeX"); +} + +void CropLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convCrop(*_inputs[0], getActs(), _imgSize, _tgtSize, _startY, _startX); +} + +void CropLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * NailbedLayer + * ===================== + */ +NailbedLayer::NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _start = pyDictGetInt(paramsDict, "start"); + _stride = pyDictGetInt(paramsDict, "stride"); + _outputsX = pyDictGetInt(paramsDict, "outputsX"); +} + +void NailbedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convBedOfNails(*_inputs[0], getActs(), _channels, _imgSize, _start, _stride, 0, 1); +} + +void NailbedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convBedOfNailsUndo(v, _prev[replicaIdx][0]->getActsGrad(), 
_channels, _imgSize, _start, _stride, scaleTargets, 1); +} + +/* + * ===================== + * GaussianBlurLayer + * ===================== + */ +GaussianBlurLayer::GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _hFilter = pyDictGetMatrix(paramsDict, "filter"); +} + +GaussianBlurLayer::~GaussianBlurLayer() { + delete _hFilter; +} + +void GaussianBlurLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convGaussianBlur(*_inputs[0], _filter, getActs(), true, _channels, 0, 1); + convGaussianBlur(getActs(), _filter, getActs(), false, _channels, 0, 1); +} + +void GaussianBlurLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + NVMatrix& tgt = _prev[replicaIdx][0]->getNumComputedActsGrads(getDeviceID()) > 0 ? _actGradsTmp : _prev[replicaIdx][0]->getActsGrad(); + convGaussianBlur(v, _filter, tgt, true, _channels, 0, 1); + convGaussianBlur(tgt, _filter, _prev[replicaIdx][0]->getActsGrad(), false, _channels, scaleTargets, 1); +} + +void GaussianBlurLayer::copyToGPU() { + _filter.copyFromHost(*_hFilter, true); +} + + /* + * ===================== + * HorizontalReflectionLayer + * ===================== + */ +HorizontalReflectionLayer::HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID) : Layer(convNet, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + assert(_channels >= 1 && _channels <= 3); +} + +void HorizontalReflectionLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convReflectHorizontal(*_inputs[0], getActs(), _imgSize); +} + +void HorizontalReflectionLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convReflectHorizontal(v, _prev[replicaIdx][0]->getActsGrad(), _imgSize); +} + +/* + * ===================== + * ResizeLayer + * ===================== + */ +ResizeLayer::ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _tgtSize = pyDictGetInt(paramsDict, "tgtSize"); + _scale = pyDictGetFloat(paramsDict, "scale"); +} + +void ResizeLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _scale); +} + +// Can't do this +void ResizeLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * RGBToYUVLayer + * ===================== + */ +RGBToYUVLayer::RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { +} + +void RGBToYUVLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convRGBToYUV(*_inputs[0], getActs()); +} + +// Can't do this +void RGBToYUVLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * RGBToLABLayer + * ===================== + */ +RGBToLABLayer::RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) { + _center = pyDictGetInt(paramsDict, "center"); +} + +void RGBToLABLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + 
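The color-space layers in this file (RGBToYUVLayer above, RGBToLABLayer here) are thin wrappers over per-pixel conversion kernels. As a rough standalone illustration of what such a conversion does, here is an RGB to YUV sketch using the common BT.601-style weights; the exact constants inside convRGBToYUV are not visible in this diff, so treat the numbers as an assumption:

#include <cstdio>

struct Pixel { float r, g, b; };
struct YUV   { float y, u, v; };

// One pixel of an RGB -> YUV transform (BT.601-style weights, assumed here).
static YUV rgbToYuv(const Pixel& p) {
    YUV out;
    out.y =  0.299f * p.r + 0.587f * p.g + 0.114f * p.b;
    out.u = -0.147f * p.r - 0.289f * p.g + 0.436f * p.b;
    out.v =  0.615f * p.r - 0.515f * p.g - 0.100f * p.b;
    return out;
}

int main() {
    Pixel p = {0.9f, 0.2f, 0.1f};
    YUV q = rgbToYuv(p);
    std::printf("Y=%.3f U=%.3f V=%.3f\n", q.y, q.u, q.v);
    return 0;
}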
convRGBToLAB(*_inputs[0], getActs(), _center); +} + +// Can't do this +void RGBToLABLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(false); +} + +/* + * ===================== + * ResponseNormLayer + * ===================== + */ +ResponseNormLayer::ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) +: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) { + _size = pyDictGetInt(paramsDict, "size"); + _scale = pyDictGetFloat(paramsDict, "scale"); + _pow = pyDictGetFloat(paramsDict, "pow"); + _minDiv = pyDictGetFloat(paramsDict, "minDiv"); +} + +void ResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + convResponseNorm(*_inputs[0], _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv); +} + +void ResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convResponseNormUndo(v, _denoms, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1); +} + +void ResponseNormLayer::truncBwdActs() { + Layer::truncBwdActs(); + _denoms.truncate(); +} + +/* + * ===================== + * CrossMapResponseNormLayer + * ===================== + */ +CrossMapResponseNormLayer::CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) +: ResponseNormLayer(convNetThread, paramsDict, replicaID) { + _blocked = pyDictGetInt(paramsDict, "blocked"); +} + +void CrossMapResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + assert(inpIdx == 0); + convResponseNormCrossMap(*_inputs[0], getActs(), _channels, _size, _scale, _pow, _minDiv, _blocked); +} + +void CrossMapResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convResponseNormCrossMapUndo(v, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, _minDiv, _blocked, scaleTargets, 1); +} + +/* + * ===================== + * ContrastNormLayer + * ===================== + */ +ContrastNormLayer::ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : ResponseNormLayer(convNetThread, paramsDict, replicaID) { +} + +void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + NVMatrix& images = *_inputs[0]; + convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler()); + _meanDiffs.add(images, -1, 1); + convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv); +} + +void ContrastNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + convContrastNormUndo(v, _denoms, _meanDiffs, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1); +} + +void ContrastNormLayer::truncBwdActs() { + ResponseNormLayer::truncBwdActs(); + _meanDiffs.truncate(); +} + +/* + * ===================== + * CostLayer + * ===================== + */ +CostLayer::CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) + : Layer(convNetThread, paramsDict, replicaID, trans) { + _coeff = pyDictGetFloat(paramsDict, "coeff"); + _numCases = 0; + _aggregated = pyDictGetInt(paramsDict, "aggregated") != 0; +} + +float CostLayer::getCoeff() { + return _coeff; +} + +void CostLayer::bprop(NVMatrix& v, 
PASS_TYPE passType, int passIdx) { + if (_coeff != 0) { + Layer::bprop(v, passType, passIdx); + } +} + +bool CostLayer::fprop(PASS_TYPE passType, int passIdx) { + if (Layer::fprop(passType, passIdx)) { + syncStream(); + getConvNet().getMessageQueue().enqueue(new Message(FPROP_TERMINAL)); + return true; + } + return false; +} + +void CostLayer::fpropCommon(PASS_TYPE passType) { + _numCases = Layer::getNumCases(*_inputs[0]); +} + +int CostLayer::getNumCases() { + return _numCases; +} + +bool CostLayer::isGradProducer() { + return _coeff != 0; +} + +doublev& CostLayer::getCost() { + return *new doublev(_costv); +} + +// This is called between microbatches +void CostLayer::resetPassIdx() { + Layer::resetPassIdx(); + _costv.clear(); +} + +CostLayer& CostLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID) { + if (type == "cost.crossent") { + return *new CrossEntCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.bce") { + return *new BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.dce") { + return *new DetectionCrossEntropyCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.logreg") { + return *new LogregCostLayer(convNetThread, paramsDict, replicaID); + } else if (type == "cost.sum2") { + return *new SumOfSquaresCostLayer(convNetThread, paramsDict, replicaID); + } + throw std::string("Unknown cost layer type ") + type; +} + +/* + * ===================== + * CrossEntCostLayer + * ===================== + */ +CrossEntCostLayer::CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { +} + +void CrossEntCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + computeCrossEntCost(labels, probs, _trueLabelLogProbs, _correctProbs); + _costv.clear(); + _costv.push_back(-_trueLabelLogProbs.sum()); + _costv.push_back(numCases - _correctProbs.sum()); + } +} + +void CrossEntCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 1); + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a softmax layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. 
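The shortcut this comment refers to falls out of the algebra: for softmax outputs y and target distribution p, pushing dE/dy = coeff * p / y through the softmax Jacobian collapses to coeff * (p - y) with respect to the softmax inputs, so the near-zero division never has to be materialized. A small numeric check of the two routes (standalone sketch, not the library's API):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float coeff = 1.0f;
    std::vector<float> x = {2.0f, -1.0f, 0.5f};   // softmax inputs (logits)
    std::vector<float> p = {1.0f, 0.0f, 0.0f};    // target distribution

    // Softmax.
    std::vector<float> y(x.size());
    float Z = 0.0f;
    for (float xi : x) Z += std::exp(xi);
    for (size_t i = 0; i < x.size(); ++i) y[i] = std::exp(x[i]) / Z;

    // Route 1: form dE/dy = coeff * p / y, then push it through the softmax Jacobian.
    std::vector<float> dEdy(x.size()), viaJacobian(x.size());
    for (size_t i = 0; i < x.size(); ++i) dEdy[i] = coeff * p[i] / y[i]; // blows up when y ~ 0
    for (size_t i = 0; i < x.size(); ++i) {
        float dot = 0.0f;
        for (size_t j = 0; j < x.size(); ++j) dot += dEdy[j] * y[j];
        viaJacobian[i] = y[i] * (dEdy[i] - dot);
    }

    // Route 2: fused form, dE/dx = coeff * (p - y). No division by y at all.
    for (size_t i = 0; i < x.size(); ++i) {
        std::printf("i=%zu  jacobian=%f  fused=%f\n",
                    i, viaJacobian[i], coeff * (p[i] - y[i]));
    }
    return 0;
}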
+ bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" || prev[1]->getDeviceID() != getDeviceID(); + if (doWork) { + computeCrossEntGrad(labels, probs, target, scaleTargets == 1, _coeff); + } +} + +/* + * ===================== + * BinomialCrossEntropyCostLayer + * ===================== + */ +BinomialCrossEntropyCostLayer::BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { + _computeSoftmaxErrorRate = pyDictGetInt(paramsDict, "computeSoftmaxErrorRate"); + _posWeight = pyDictGetFloat(paramsDict, "posWeight"); +} + +void BinomialCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + labels.applyBinary(BinomialCrossEntOperator(_posWeight), probs, _tmpProbs); + _costv.clear(); + // Cross-entropy cost + _costv.push_back(-_tmpProbs.sum(_tmpbuf));// / labels.getFollowingDim()); + + // If aggregated, we don't produce these outputs because they're not additive. + // They have no meaning if this is just a partial cost. + if (!_aggregated) { + // "Correct" classifications. To compute these we threshold probs + // and just count the number of entries that agree with labels. + probs.biggerThanScalar(0.5, _tmpProbs); + _tmpProbs.equals(labels); + _costv.push_back((_tmpProbs.getNumElements() - _tmpProbs.sum(_tmpbuf)) / double(labels.getFollowingDim())); + + if (_computeSoftmaxErrorRate) { + // Also compute top-1 error as if this is softmax and there's only one correct class + probs.max(0, _tmpVec); + assert(_tmpVec.getNumElements() == numCases); // Make sure we did max on correct axis + probs.equalsVector(_tmpVec, _correctProbs); + _correctProbs.sum(0, _tmpVec); // Divide by the # of labels that we predict as being present + float m = _tmpVec.max(); + + _correctProbs.eltwiseDivideByVector(_tmpVec); + _correctProbs.eltwiseMult(labels); + + _costv.push_back(numCases - _correctProbs.sum(_tmpbuf)); + } + } + } +} + +void BinomialCrossEntropyCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + assert(inpIdx == 1); + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a logistic neuron layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. 
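The analogous cancellation for a logistic output is what this comment counts on: with y = sigmoid(x) and binary target t, the derivative of t*log(y) + (1-t)*log(1-y) with respect to x is simply t - y, so folding the gradient into the logistic layer avoids dividing by y*(1-y). A small numeric check (sketch only; the posWeight weighting used by BinomialCrossEntGradientOperator is not reproduced here):

#include <cmath>
#include <cstdio>

static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
    const float t = 1.0f;   // binary label
    const float x = -4.0f;  // pre-sigmoid activation
    const float y = sigmoid(x);

    // Chain rule written out: dE/dy * dy/dx, with E = t*log(y) + (1-t)*log(1-y).
    float dEdy = t / y - (1.0f - t) / (1.0f - y);  // huge when y is near 0 or 1
    float dydx = y * (1.0f - y);                   // tiny in the same regime
    float chained = dEdy * dydx;

    // Fused form: the product collapses to t - y.
    float fused = t - y;

    std::printf("chained=%f fused=%f\n", chained, fused);
    return 0;
}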
+ bool doWork = prev[1]->getNext().size() > 1 + || prev[1]->getType() != "neuron" + || static_cast(prev[1])->getNeuronType() != "logistic" + || prev[1]->getDeviceID() != getDeviceID() + || prev[1]->getNumReplicas() != getNumReplicas(); + if (doWork) { + printf("Computing cross-entropy gradient the stupid way\n"); + if (scaleTargets == 0) { + labels.applyBinary(BinomialCrossEntGradientOperator(_coeff, _posWeight), probs, target); + } else { + labels.applyTernary(AddGradientBinaryOperator(BinomialCrossEntGradientOperator(_coeff, _posWeight)), probs, target, target); + } + } +} + +float BinomialCrossEntropyCostLayer::getPosWeight() { + return _posWeight; +} +/* + * ===================== + * DetectionCrossEntropyCostLayer + * ===================== + */ +DetectionCrossEntropyCostLayer::DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) + : BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID) { + assert(!_aggregated); +} + +void DetectionCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + BinomialCrossEntropyCostLayer::fpropActs(inpIdx, scaleTargets, passType, passIdx); + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + int numCases = labels.getLeadingDim(); + + /* + * Add information sufficient to compute precision and recall for each class. + */ + // NOTE: _tmpProbs contains ((probs > 0.5) == labels) + labels.sum(1, _numPositive); // sum(labels, 1) + + _tmpProbs.eltwiseMult(labels); // labels * ((probs > 0.5) == labels) + _tmpProbs.sum(1, _numTruePositive); + + probs.biggerThanScalar(0.5, _tmpProbs); + _tmpProbs.sum(1, _numDeclaredPositive); + + _numDeclaredPositive.copyToHost(_hNumDeclaredPositive, true); + _numPositive.copyToHost(_hNumPositive, true); + _numTruePositive.copyToHost(_hNumTruePositive, true); + + for (int i = 0; i < labels.getFollowingDim(); ++i) { + _costv.push_back(_hNumDeclaredPositive(i, 0)); // 2 + _costv.push_back(_hNumPositive(i, 0)); // 3 + _costv.push_back(_hNumTruePositive(i, 0)); // 4 + } + + } +} + +/* + * ===================== + * LogregCostLayer + * ===================== + */ +LogregCostLayer::LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { + _topk = pyDictGetInt(paramsDict, "topk"); +// _numAccumed = 0; +} + +void LogregCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + // This layer uses its two inputs together + if (inpIdx == 0) { + NVMatrix& labels = *_inputs[0]; + NVMatrix* probs = _inputs[1]; + + _doCompute = !IS_MULTIVIEW_TEST(passType); + if (!_doCompute) { + if (IS_MULTIVIEW_TEST_START(passType)) { + if (_probsAccum.count(passIdx) == 0) { + _probsAccum[passIdx] = new NVMatrix(*probs); + } + probs->copy(*_probsAccum[passIdx]); + _numAccumed[passIdx] = 1; + } else { + _probsAccum[passIdx]->add(*probs); + _numAccumed[passIdx] += 1; + } + if (IS_MULTIVIEW_TEST_END(passType)) { + probs = _probsAccum[passIdx]; + probs->scale(1.0 / _numAccumed[passIdx]); + _doCompute = true; + } + } + if (_doCompute) { + int numCases = labels.getNumElements(); + probs->max(0,_maxProbs); + if (_topk == 1) { + computeLogregCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs); + } else { + computeMultiSoftmaxCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs, _topkProbs, _topk); + } + _costv.clear(); + double top1 = 
_correctProbs.sum(_tmpbuf); + + _costv.push_back(-_trueLabelLogProbs.sum(_tmpbuf)); + _costv.push_back(numCases - top1); + _costv.push_back(numCases - (_topk == 1 ? top1 : _topkProbs.sum(_tmpbuf))); + + } + } +} + +NVMatrix& LogregCostLayer::getProbsAccum(int replicaIdx) { + return *_probsAccum[replicaIdx]; +} + +void LogregCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + if (inpIdx == 1) { + LayerV& prev = _prev[replicaIdx]; + NVMatrix& labels = *_inputs[0]; + NVMatrix& probs = *_inputs[1]; + NVMatrix& target = prev[1]->getActsGrad(); + // Numerical stability optimization: if the layer below me is a softmax layer, let it handle + // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. + bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" + || prev[1]->getDeviceID() != getDeviceID() || prev[1]->getNumReplicas() != getNumReplicas(); + if (prev[1]->getType() == "softmax") { + static_cast(prev[1])->setDoUpperGrad(!doWork); + } + if (doWork) { + computeLogregGrad(labels, probs, target, scaleTargets == 1, _coeff); + } + } +} + +/* + * ===================== + * SumOfSquaresCostLayer + * ===================== + */ +SumOfSquaresCostLayer::SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { +} + +void SumOfSquaresCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { + _inputs[0]->apply(NVMatrixOps::Square(), _tmp); + _costv.clear(); + _costv.push_back(_tmp.sum()); +} + +void SumOfSquaresCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { + _prev[replicaIdx][inpIdx]->getActsGrad().add(*_inputs[0], scaleTargets, -2 * _coeff); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu new file mode 100644 index 0000000..39995a6 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu @@ -0,0 +1,555 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include "../include/layer_kernels.cuh" + +using namespace std; + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxEnergies: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * top5Probs: (1, numCases) (*out) + * + * target: (1, numCases) + * + */ +__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs, + float* labelLogProbs, float* correctProbs, float* top5Probs, + const int numCases, const int numOut, const int setSize) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + const int label = int(labels[tx]); + const float maxp = maxProbs[tx]; + const float labelp = probs[label * numCases + tx]; + + labelLogProbs[tx] = __logf(labelp); + + int numBiggerProbs = 0, numEqualsProbs = 0; + for (int i = 0; i < numOut; ++i) { + numBiggerProbs += probs[i * numCases + tx] > labelp; + numEqualsProbs += probs[i * numCases + tx] == labelp; + } + + const int slotsLeft = setSize - numBiggerProbs; + + top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs); + correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs); + } +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * top5Probs: (1, numCases) (*out) + * + * target: (1, numCases) == log(y_l[labels,:] + */ +void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, + NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.getNumElements() == numCases); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + +// NVMatrix& maxProbs = probs.max(0); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + top5Probs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + + cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1); + kMultiSoftmaxCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(), + numCases, numOut, setSize); + + getLastCudaError("kMultiSoftmaxCost: Kernel execution failed"); +// cudaThreadSynchronize(); +} + +/* + * E = sum(p_l * log(y_l)) + * probs: (numOut, numCases) + * labels: (numOut, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs, + const int numCases, const int numOut) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + probs += tx; + labels += tx; + maxProbs += tx; + labelLogProbs += tx; + correctProbs += tx; + + const float maxp = maxProbs[0]; + + /* + * Compute the probability of guessing the correct case if you take the most-probable label. + * + * This is done like this: + * + * - If the most probable label is not equal to the true label, then the probability is zero. 
+ * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum). + * + * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned + * maximum probability. But it's a safety measure to prevent over-estimating your accuracy. + * Though it could never happen in reality. Well it could. But it wouldn't. Cool? + */ + float crossEnt = 0.0f; + int numMax = 0; + bool correctLabel = false; + for (int i = 0; i < numOut; i++) { + const float label_prob = labels[i * numCases]; + const float model_prob = probs[i * numCases]; + numMax += model_prob == maxp; + crossEnt += label_prob * safelog(model_prob); + correctLabel |= model_prob == maxp && label_prob > 0.0f; + } + labelLogProbs[0] = crossEnt; + if (!correctLabel) { + correctProbs[0] = 0.0f; + } else { + correctProbs[0] = 1.0f / float(numMax); + } + } +} + +/* + * E = sum(p_l * log(y_l)) + * y_l: (numOut, numCases) + * labels: (numOut, numCases) + * + * dE_dy_l: (numOut, numCases) + */ +template +__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const float label_prob = labels[tidx]; + const float model_prob = y_l[tidx]; + const float v = gradCoeff * __fdividef(label_prob, model_prob); + if (add) { + dE_dy_l[tidx] += v; + } else { + dE_dy_l[tidx] = v; + } + } +} + +/* + * E = sum(p_l * log(y_l)) + * y_l: (numOut, numCases) + * labels: (numOut, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const float model_prob = y_l[tidx]; + const float label_prob = labels[tidx]; + float v = gradCoeff * (label_prob - model_prob); + if (add) { + dE_dx_l[tidx] += v; + } else { + dE_dx_l[tidx] = v; + } + } +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs, + const int numCases, const int numOut) { + const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x; + + if (tx < numCases) { + const int label = int(labels[tx]); + const float maxp = maxProbs[tx]; + const float labelp = probs[label * numCases + tx]; + + labelLogProbs[tx] = __logf(labelp); + + /* + * Compute the probability of guessing the correct case if you take the most-probable label. + * + * This is done like this: + * + * - If the most probable label is not equal to the true label, then the probability is zero. + * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum). + * + * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned + * maximum probability. But it's a safety measure to prevent over-estimating your accuracy. + * Though it could never happen in reality. Well it could. 
But it wouldn't. Cool? + */ + if (labelp != maxp) { + correctProbs[tx] = 0; + } else { + int numMax = 0; + for (int i = 0; i < numOut; i++) { + numMax += probs[i * numCases + tx] == maxp; + } + correctProbs[tx] = 1.0f / float(numMax); + } + } +} + +/* + * E = -log(y_t) + * y_l: (numOut, numCases) + * labels: (1, numCases) + * + * dE_dy_l: (numOut, numCases) + */ +template +__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const int label = int(labels[tx]); + float v = gradCoeff * (label == ty); + v = __fdividef(v, y_l[tidx]); + if (add) { + dE_dy_l[tidx] += v; + } else { + dE_dy_l[tidx] = v; + } + } +} + +/* + * E = -log(y_t) + * y_l: (numOut, numCases) + * labels: (1, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, + const int numOut, const float gradCoeff) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + const int label = int(labels[tx]); + float v = gradCoeff * ((label == ty) - y_l[tidx]); + if (add) { + dE_dx_l[tidx] += v; + } else { + dE_dx_l[tidx] = v; + } + } +} + +/* + * dE_dy_l: (numOut, numCases) + * y_l: (numOut, numCases) + * + * dE_dx_l: (numOut, numCases) + */ +template +__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) { + const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x; + const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y; + const int tidx = ty * numCases + tx; + + if (ty < numOut && tx < numCases) { + float v = 0; + for (int j = 0; j < numOut; j++) { + v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]); + } + v *= y_l[tidx]; + + if (add) { + dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v; + } else { + dE_dx_l[tidx] = scaleGrad * v; + } + } +} + +template +__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target, + const int numElements) { + for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) { + if (add) { + target[i] += actGrad[i] * (output[i] == input[i]); + } else { + target[i] = actGrad[i] * (output[i] == input[i]); + } + } +} + +void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) { + assert(actGrad.isContiguous()); + assert(output.isContiguous()); + assert(input.isContiguous()); + assert(actGrad.isSameDims(input)); + assert(actGrad.isSameDims(output)); + + dim3 blocks(DIVUP(actGrad.getNumElements(), 128)); + dim3 threads(128); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (add) { + assert(actGrad.isSameDims(target)); + cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1); + kEltwiseMaxGrad<128, true><<>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements()); + } else { + target.resize(actGrad); + cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1); + kEltwiseMaxGrad<128, 
false><<>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements()); + } + + getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed"); +} + +/* + * E = sum_i{-p_i*log(y_i)} + * probs: (numOut, numCases) + * labels: (numOut, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) + */ +void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.isSameDims(probs)); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + + NVMatrix& maxProbs = probs.max(0); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1); + kCrossEntCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), + numCases, numOut); + getLastCudaError("kCrossEntCost: Kernel execution failed"); + + delete &maxProbs; +} + +void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.isSameDims(probs)); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kCrossEntGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kCrossEntGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("kCrossEntGrad: Kernel execution failed"); +} + +void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) { + int numCases = acts.getLeadingDim(); + int numOut = acts.getFollowingDim(); + + assert(acts.isSameDims(actsGrad)); + assert(acts.isContiguous()); + assert(actsGrad.isContiguous()); + assert(target.isContiguous()); + assert(acts.isTrans()); + assert(actsGrad.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + + if (scaleTarget == 0) { + target.resize(acts); + kSoftmaxGrad<<>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad); + } else { + kSoftmaxGrad<<>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad); + } + getLastCudaError("computeSoftmaxGrad: Kernel execution failed"); +} + +void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getLeadingDim() == probs.getLeadingDim() && 
labels.getFollowingDim() == probs.getFollowingDim()); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(probs.isTrans()); + assert(!labels.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad, cudaFuncCachePreferL1); + kCrossEntSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad, cudaFuncCachePreferL1); + kCrossEntSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed"); +} + +/* + * E = -log(y_t) + * probs: (numOut, numCases) + * labels: (1, numCases) + * maxProbs: (1, numCases) + * labelLogProbs: (1, numCases) (*out) + * correctProbs: (1, numCases) (*out) + * + * target: (1, numCases) == log(y_l[labels,:] + */ +void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) { + int numCases = probs.getNumCols(); + int numOut = probs.getNumRows(); + + assert(labels.getNumElements() == numCases); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + assert(labels.isContiguous()); + assert(probs.isContiguous()); + + labelLogProbs_out.resize(1, numCases); + correctProbs_out.resize(1, numCases); + dim3 threads(LOGREG_ERR_THREADS_X, 1); + dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1); + cudaStream_t stream = NVMatrix::getDefaultStream(); + cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1); + kLogregCost<<>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(), + labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), + numCases, numOut); + getLastCudaError("computeLogregCost: Kernel execution failed"); +} + +void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getNumElements() == numCases); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(!labels.isTrans()); + assert(!probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kLogregCostGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kLogregCostGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("computeLogregGrad: Kernel execution failed"); +} + +void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) { + int numCases = probs.getLeadingDim(); + int numOut = probs.getFollowingDim(); + assert(labels.getNumElements() == numCases); + assert(probs.isContiguous()); + assert(target.isContiguous()); + assert(labels.isContiguous()); + assert(probs.isTrans()); + + dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y); + dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y)); + 
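The gradient kernels in this file come in two flavors, overwrite and accumulate, selected by a compile-time flag so the per-element branch costs nothing, and the host wrappers resize the target only in the overwrite case before launching. A stripped-down CPU sketch of that dispatch pattern, with illustrative names:

#include <cstdio>
#include <vector>

// 'add' is a compile-time flag: true accumulates into target, false overwrites it.
template <bool add>
static void scaleInto(const std::vector<float>& src, float coeff, std::vector<float>& target) {
    for (size_t i = 0; i < src.size(); ++i) {
        float v = coeff * src[i];
        if (add) {
            target[i] += v;   // branch is resolved at compile time in each instantiation
        } else {
            target[i] = v;
        }
    }
}

int main() {
    std::vector<float> grad = {1.0f, 2.0f, 3.0f};
    std::vector<float> target(grad.size(), 10.0f);

    scaleInto<false>(grad, 0.5f, target);  // overwrite: target becomes {0.5, 1.0, 1.5}
    scaleInto<true>(grad, 0.5f, target);   // accumulate: target becomes {1.0, 2.0, 3.0}

    for (float v : target) std::printf("%f\n", v);
    return 0;
}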
cudaStream_t stream = NVMatrix::getDefaultStream(); + if (!add) { + target.resize(probs); + kLogregSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } else { + kLogregSoftmaxGrad<<>>(probs.getDevData(), labels.getDevData(), target.getDevData(), + numCases, numOut, coeff); + } + + getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed"); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu new file mode 100644 index 0000000..1cea787 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/lr.cuh" +#include "../include/util.cuh" + +/* + * ================================== + * ParameterSchedule + * ================================== + */ +ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) { + std::string type = pyDictGetString(schedDict, "type"); + PyObject* paramsDict = PyDict_GetItemString(schedDict, "params"); + double base = pyDictGetFloat(paramsDict, "base"); + if (type == "const") { + return *new ParameterSchedule(base); + } else { + double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor"); + if (type == "linear") { + return *new LinearParameterSchedule(base, tgtFactor); + } else if (type == "exp") { + return *new ExpParameterSchedule(base, tgtFactor); + } else if (type == "dexp") { + double numSteps = pyDictGetInt(paramsDict, "numSteps"); + return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps); + } + } + throw std::string("Unknown learning rate schedule type ") + type; +} + +ParameterSchedule::ParameterSchedule(double baseRate) + : _baseRate(baseRate) { +} + +double ParameterSchedule::getValue(double progress) { + return _baseRate; +} + +double ParameterSchedule::getBaseValue() const { + return _baseRate; +} + +ParameterSchedule::~ParameterSchedule() { +} + +/* + * ================================== + * LinearParameterSchedule + * ================================== + */ +LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor) +: ParameterSchedule(baseRate) { + _finalRate = baseRate / tgtFactor; +} + +double LinearParameterSchedule::getValue(double progress) { + return _baseRate * (1 - progress) + _finalRate * progress; +} + +/* + * ================================== + * ExpParameterSchedule + * ================================== + */ +ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor) +: ParameterSchedule(baseRate) { + _powBase = 1.0 / tgtFactor; +} + +double ExpParameterSchedule::getValue(double progress) { + return _baseRate * std::pow(_powBase, progress); +} + +/* + * ================================== + * DiscreteExpParameterSchedule + * ================================== + */ +DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps) 
+: ParameterSchedule(baseRate) { + ExpParameterSchedule elrs(baseRate, tgtFactor); + double finalRate = baseRate / tgtFactor; + for (int i = 0; i < numSteps - 1; i++) { + double progress = double(i) / (numSteps - 1); + _rates.push_back(elrs.getValue(progress)); + } + _rates.push_back(finalRate); + //printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps); +} + +double DiscreteExpParameterSchedule::getValue(double progress) { + for (int i = 0; i < _rates.size(); ++i) { + if (progress <= double(i + 1) / _rates.size()) { + return _rates[i]; + } + } + return _rates.back(); +} + diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu new file mode 100644 index 0000000..cd2d299 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu @@ -0,0 +1,139 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/memorysource.cuh" + +using namespace std; + +/* + * ======================= + * MemoryView + * ======================= + */ +MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) { +} + +MemoryView::~MemoryView() { +// if (_src->truncate(_name)) { +// delete _src; +// } +} + +NVMatrix& MemoryView::getMemory(int numCases) { + return _src->getMemory(_name, numCases); +} + +NVMatrix& MemoryView::getMemory() { + return _src->getMemory(_name); +} + +MemorySource& MemoryView::getMemorySource() { + return *_src; +} + +bool MemoryView::isParent() { + return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize(); +} + +std::string& MemoryView::getName() { + return _name; +} + +MemoryView& MemoryView::clone(std::string& name) { + return _src->addUser(name, _src->getRange(_name)); +} + +/* + * ======================= + * MemorySource + * ======================= + */ +MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) { +} + +MemorySource::~MemorySource() { + // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource. + // So this is a no-op. +} + +NVMatrix& MemorySource::getMemory(std::string& name) { + return getMemory(name, _memory.getLeadingDim()); +} + +// Deletes old view when appropriate +NVMatrix& MemorySource::getMemory(std::string& name, int numCases) { + numCases = numCases < 0 ? 
_memory.getLeadingDim() : numCases; + _lock.acquire(); + if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) { + int d = NVMatrix::getDeviceID(); + NVMatrix::setDeviceID(_deviceID); + _memory.resize(_size, numCases, false); + for (map::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) { + delete it->second; + } + _memoryViews.clear(); + if (d >= 0) { + NVMatrix::setDeviceID(d); + } + } + if (_memoryViews.count(name) == 0) { + assert(!_memory.isTrans()); + _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second); + } + NVMatrix& view = *_memoryViews[name]; + assert(view.isContiguous()); + _lock.release(); + return view; +} + +MemoryView& MemorySource::addUser(std::string& name, std::pair range) { + assert(_viewRanges.count(name) == 0); + _viewRanges[name] = range; + return *new MemoryView(*this, name); +} + +MemoryView& MemorySource::addUser(std::string& name) { + return addUser(name, std::pair(0, _size)); +} + +MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) { + return (new MemorySource(size, deviceID))->addUser(parentUser); +} + +pair MemorySource::getRange(std::string& name) { + return _viewRanges[name]; +} + +int MemorySource::getSize() { + return _size; +} + +bool MemorySource::truncate(std::string& name) { + bool truncated = false; + _lock.acquire(); + _truncateRequests.insert(name); + if (_truncateRequests.size() == _viewRanges.size()) { + for (map::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) { + delete it->second; + } + _memoryViews.clear(); + _memory.truncate(); + _truncateRequests.clear(); + truncated = true; + } + _lock.release(); + return truncated; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu new file mode 100644 index 0000000..bf6fd40 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu @@ -0,0 +1,75 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../include/neuron.cuh" +#include "../include/util.cuh" + +using namespace std; + +Neuron& Neuron::makeNeuron(PyObject* neuronDict) { + std::string type = pyDictGetString(neuronDict, "type"); + PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params"); + + if (type == "relu") { + return *new ReluNeuron(); + } + + if (type == "drelu") { + return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "softrelu") { + return *new SoftReluNeuron(); + } + + if (type == "brelu") { + return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "abs") { + return *new AbsNeuron(); + } + + if (type == "logistic") { + return *new LogisticNeuron(); + } + + if (type == "tanh") { + return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); + } + + if (type == "square") { + return *new SquareNeuron(); + } + + if (type == "sqrt") { + return *new SqrtNeuron(); + } + + if (type == "linear") { + return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); + } + + if (type == "log") { + return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a")); + } + + if (type == "ident") { + return *new Neuron(); + } + + throw std::string("Unknown neuron type: ") + type; +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu new file mode 100644 index 0000000..ed1aacf --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu @@ -0,0 +1,271 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../util/include/matrix.h" +#include "../../util/include/queue.h" +#include "../include/worker.cuh" +#include "../include/util.cuh" +#include "../include/cost.cuh" + +#include "../include/pyconvnet.cuh" +#include "../include/convnet.cuh" + +#include "../include/jpeg.h" + +using namespace std; +static ConvNet* model = NULL; + +static PyMethodDef _ConvNetMethods[] = {{ "initModel", initModel, METH_VARARGS }, + { "startBatch", startBatch, METH_VARARGS }, + { "finishBatch", finishBatch, METH_VARARGS }, + { "checkGradients", checkGradients, METH_VARARGS }, + { "startMultiviewTest", startMultiviewTest, METH_VARARGS }, + { "startFeatureWriter", startFeatureWriter, METH_VARARGS }, + { "startDataGrad", startDataGrad, METH_VARARGS }, + { "syncWithHost", syncWithHost, METH_VARARGS }, + { "decodeJpeg", decodeJpeg, METH_VARARGS }, + { NULL, NULL } +}; + +void init_ConvNet() { + (void) Py_InitModule("_ConvNet", _ConvNetMethods); + import_array(); +} + +void signalHandler(int sig) { + const size_t max_trace_size = 40; + void *array[max_trace_size]; + size_t trace_size = backtrace(array, max_trace_size); + fprintf(stderr, "Error signal %d:\n", sig); + backtrace_symbols_fd(array, trace_size, STDERR_FILENO); + exit(1); +} + +PyObject* initModel(PyObject *self, PyObject *args) { + assert(model == NULL); + signal(SIGSEGV, signalHandler); + signal(SIGABRT, signalHandler); + + PyDictObject* pyLayerParams; + PyListObject* pyDeviceIDs; + int pyMinibatchSize; + int conserveMem; + + if (!PyArg_ParseTuple(args, "O!O!ii", + &PyDict_Type, &pyLayerParams, + &PyList_Type, &pyDeviceIDs, + &pyMinibatchSize, + &conserveMem)) { + return NULL; + } + intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs); + + model = new ConvNet((PyObject*)pyLayerParams, + deviceIDs, + pyMinibatchSize, + conserveMem); + + model->start(); + return Py_BuildValue("i", 0); +} + +/* + * Starts training/testing on the given batch (asynchronous -- returns immediately). + */ +PyObject* startBatch(PyObject *self, PyObject *args) { + assert(model != NULL); +// printf("starting next batch\n"); + PyListObject* data; + double progress; + int test = 0; + if (!PyArg_ParseTuple(args, "O!d|i", + &PyList_Type, &data, + &progress, + &test)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + + TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +/* + * Starts testing on the given batch (asynchronous -- returns immediately). + */ +PyObject* startMultiviewTest(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + int numViews; + PyArrayObject* pyProbs = NULL; + char* logregName = NULL; + if (!PyArg_ParseTuple(args, "O!i|O!s", + &PyList_Type, &data, + &numViews, + &PyArray_Type, &pyProbs, + &logregName)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + MultiviewTestWorker* wr = pyProbs == NULL ? 
new MultiviewTestWorker(*model, *cpuData, numViews) + : new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +PyObject* startFeatureWriter(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + PyListObject* pyFtrs; + PyListObject* pyLayerNames; + if (!PyArg_ParseTuple(args, "O!O!O!", + &PyList_Type, &data, + &PyList_Type, &pyFtrs, + &PyList_Type, &pyLayerNames)) { + return NULL; + } + stringv* layerNames = getStringV((PyObject*)pyLayerNames); + CPUData* cpuData = new CPUData((PyObject*)data); + MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs); + + FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames); + model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +PyObject* startDataGrad(PyObject *self, PyObject *args) { +// assert(model != NULL); +// PyListObject* data; +// int dataLayerIdx, softmaxLayerIdx; +// if (!PyArg_ParseTuple(args, "O!ii", +// &PyList_Type, &data, +// &dataLayerIdx, &softmaxLayerIdx)) { +// return NULL; +// } +// CPUData* cpuData = new CPUData((PyObject*)data); +// Matrix& ftrs = *mvec.back(); +// mvec.pop_back(); +// +// DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx); +// model->getWorkerQueue().enqueue(wr); + return Py_BuildValue("i", 0); +} + +/* + * Waits for the trainer to finish training on the batch given to startBatch. + * This is a blocking call so lets release the GIL. + */ +PyObject* finishBatch(PyObject *self, PyObject *args) { + assert(model != NULL); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::BATCH_DONE); + + Cost& cost = res->getResults(); + PyObject* dict = PyDict_New(); + CostMap& costMap = cost.getCostMap(); + for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) { + PyObject* v = PyList_New(0); + for (vector::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) { + PyObject* f = PyFloat_FromDouble(*iv); + PyList_Append(v, f); + } + PyDict_SetItemString(dict, it->first.c_str(), v); + } + PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases()); + delete res; // Deletes cost too + + return retVal; +} + +PyObject* checkGradients(PyObject *self, PyObject *args) { + assert(model != NULL); + PyListObject* data; + if (!PyArg_ParseTuple(args, "O!", + &PyList_Type, &data)) { + return NULL; + } + CPUData* cpuData = new CPUData((PyObject*)data); + + GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData); + model->getWorkerQueue().enqueue(wr); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::BATCH_DONE); + delete res; + return Py_BuildValue("i", 0); +} + +/* + * Copies weight matrices from GPU to system memory. 
+ */ +PyObject* syncWithHost(PyObject *self, PyObject *args) { + assert(model != NULL); + SyncWorker* wr = new SyncWorker(*model); + model->getWorkerQueue().enqueue(wr); + WorkResult* res = model->getResultQueue().dequeue(); + assert(res != NULL); + assert(res->getResultType() == WorkResult::SYNC_DONE); + + delete res; + return Py_BuildValue("i", 0); +} + +PyObject* decodeJpeg(PyObject *self, PyObject *args) { + PyListObject* pyJpegStrings; + PyArrayObject* pyTarget; + int img_size, inner_size, test, multiview; + if (!PyArg_ParseTuple(args, "O!O!iiii", + &PyList_Type, &pyJpegStrings, + &PyArray_Type, &pyTarget, + &img_size, + &inner_size, + &test, + &multiview)) { + return NULL; + } + + Thread* threads[NUM_JPEG_DECODER_THREADS]; + int num_imgs = PyList_GET_SIZE(pyJpegStrings); + int num_imgs_per_thread = DIVUP(num_imgs, NUM_JPEG_DECODER_THREADS); + Matrix& dstMatrix = *new Matrix(pyTarget); + for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { + int start_img = t * num_imgs_per_thread; + int end_img = min(num_imgs, (t+1) * num_imgs_per_thread); + + threads[t] = new DecoderThread((PyObject*)pyJpegStrings, dstMatrix, start_img, end_img, img_size, inner_size, test, multiview); + threads[t]->start(); + } + + for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { + threads[t]->join(); + delete threads[t]; + } + assert(dstMatrix.isView()); + delete &dstMatrix; + return Py_BuildValue("i", 0); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu new file mode 100644 index 0000000..e58c640 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu @@ -0,0 +1,350 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/reducepipeline.cuh" + +using namespace std; + +/* ========================= + * IReducerSegment + * ========================= + */ +// Null mat --> reducer on host +IReduceSegment::IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue* finishQueue) +: _deviceID(deviceID), _next(NULL), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getTgtDeviceID())) { +} + +IReduceSegment::~IReduceSegment() { +} + +NVMatrix& IReduceSegment::getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx) { + NVMatrix& line = mat.reshaped(1, mat.getNumElements()); + int start = chunkIdx * chunkSize; + int end = min((chunkIdx+1) * chunkSize, mat.getNumElements()); +// _mat->printShape("_mat"); + NVMatrix& chunk = line.sliceCols(start, end); + delete &line; +// chunk.printShape("chunk"); + return chunk; +} + +void* IReduceSegment::run() { + bool exit = false; + while (!exit) { + ReduceMessage& msg = *_queue.dequeue(); + if (msg.getType() == EXIT) { + exit = true; + } else { + bool term = processMessage(msg); + if (term) { + assert(_finishQueue); + _finishQueue->enqueue(1); + } + } + delete &msg; + } + return NULL; +} + +inline NVMatrix& IReduceSegment::getMatrix(ReduceMessage& msg) { + return msg.getMatrix(getDeviceID()); +} + +Queue& IReduceSegment::getQueue() { + return _queue; +} + +inline int IReduceSegment::getDeviceID() const { + return _deviceID; +} + +void IReduceSegment::addPrev(IReduceSegment& c) { + _prev.push_back(&c); +} + +void IReduceSegment::addNext(ReducePeer& c) { + assert(_next == NULL); + _next = &c; + c.addPrev(*this); +} + +bool IReduceSegment::isTerminal() const { + return _next == NULL; +} + +/* ========================= + * ReducerSource + * ========================= + */ +ReducerSource::ReducerSource(IEightGPUReducer& parent, int deviceID) : IReduceSegment(parent, deviceID, NULL) { +} + +bool ReducerSource::processMessage(ReduceMessage& msg) { + assert(msg.getType() == REDUCE_START); + int numChunks = min(getMatrix(msg).getNumElements(), max(REDUCE_MIN_CHUNKS, min(REDUCE_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), REDUCE_MIN_CHUNK_SIZE)))); + int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks); + //printf("num chunks: %d\n", numChunks); + for (int c = 0; c <= numChunks; ++c) { + _next->getQueue().enqueue(new ReduceChunkMessage(*this, c, chunkSize, numChunks, msg.getScaleIntermediates(), msg.getScaleTarget(), msg.getMatrices())); + } + return false; +} + +/* ========================= + * ReducerPeer + * ========================= + */ +ReducePeer::ReducePeer(IEightGPUReducer& parent,int deviceID, Queue* finishQueue) : IReduceSegment(parent, deviceID, finishQueue), _numInputsFinished(0) { + _add = deviceID != DEVICE_HOST; +} + +ReducePeer::ReducePeer(IEightGPUReducer& parent) : IReduceSegment(parent, DEVICE_HOST, NULL), _numInputsFinished(0), _add(false) { +} + +ReducePeer::~ReducePeer() { + for(std::map::iterator it = _streams.begin(); it != _streams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(it->second)); + } + _streams.clear(); +} + +inline cudaStream_t ReducePeer::getStream(int deviceID) { + if (deviceID < 0) { + return NULL; + } + if (_streams.count(deviceID) == 0) { + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_streams[deviceID], cudaStreamNonBlocking)); + } + return _streams[deviceID]; +} + +bool ReducePeer::processMessage(ReduceMessage& msg) { + assert(msg.getType() == REDUCE_CHUNK); + + ReduceChunkMessage& cmsg = *static_cast(&msg); +// if 
(_numInputsReceived.count(cmsg.getChunkIdx()) == 0) { +// _numInputsReceived[cmsg.getChunkIdx()] = 0; +// } + int& inputsRcvd = ++_numInputsReceived[cmsg.getChunkIdx()]; +// printf("reducer on device %d got msg chunk idx %d of %d, inputs rcvd for this chunk idx: %d/%d\n", +// getDeviceID(), cmsg.getChunkIdx(), cmsg.getNumChunks(),_numInputsReceived[cmsg.getChunkIdx()], _prev.size()); + if (cmsg.getChunkIdx() < cmsg.getNumChunks()) { + IReduceSegment& src = cmsg.getSource(); + float scalePrev = isTerminal() ? cmsg.getScaleIntermediates() : 1; + float scaleSelf = inputsRcvd == 1 ? _add * (isTerminal() ? cmsg.getScaleTarget() : 1): 1; + if (scaleSelf == 0 || isTerminal()) { + if (getDeviceID() >= 0) { + NVMatrix::setDeviceID(getDeviceID()); + } + getMatrix(msg).resize(src.getMatrix(msg)); + } + assert(getMatrix(msg).isSameDims(src.getMatrix(msg))); + NVMatrix& prevChunk = getChunk(src.getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); + int execDeviceID = getDeviceID() >= 0 ? getDeviceID() : src.getDeviceID(); + if (execDeviceID >= 0) { + NVMatrix::setDeviceID(execDeviceID); + prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, getStream(execDeviceID)); + NVMatrix::syncStream(getStream(execDeviceID)); + } else { + assert(!isTerminal()); + hostAdd(prevChunk.getDevData(), myChunk.getDevData(), prevChunk.getNumElements(), scaleSelf); + } + + delete &prevChunk; + delete &myChunk; + + } else { + _numInputsFinished++; + } + if (!isTerminal() && inputsRcvd == _prev.size()) { +// printf(" device %d enqueueing msg for next on device %d\n", getDeviceID(), _next->getDeviceID()); + _next->getQueue().enqueue( + new ReduceChunkMessage(*this, cmsg.getChunkIdx(), cmsg.getChunkSize(), cmsg.getNumChunks(), + cmsg.getScaleIntermediates(), cmsg.getScaleTarget(), cmsg.getMatrices())); + } + + bool finished = _numInputsFinished == _prev.size(); + if (finished) { + _numInputsFinished = 0; + _numInputsReceived.clear(); + } + return finished && isTerminal(); +} + +void ReducePeer::hostAdd(const float* src, float* tgt, const int n, const float scaleTgt) { + if (scaleTgt != 0) { + for (int i = 0; i < n; ++i) { + tgt[i] = scaleTgt * tgt[i] + src[i]; + } + } else { + for (int i = 0; i < n; ++i) { + tgt[i] = src[i]; + } + } +} + +inline NVMatrix& ReducePeer::getMatrix(ReduceMessage& msg) { + if (getDeviceID() != DEVICE_HOST) { + return IReduceSegment::getMatrix(msg); + } + return _mat; +} + +/* ========================= + * EightGPUReducer + * ========================= + */ +IEightGPUReducer::IEightGPUReducer(int tgtDeviceID) : _tgtDeviceID(tgtDeviceID) { +} + +IEightGPUReducer::~IEightGPUReducer() { + vector v; + v.insert(v.end(), _sources.begin(), _sources.end()); + v.insert(v.end(), _peers.begin(), _peers.end()); + for (vector::iterator it = v.begin(); it != v.end(); ++it) { + (*it)->getQueue().enqueue(new ReduceMessage(EXIT)); + (*it)->join(); + delete *it; + } +} + +IEightGPUReducer& IEightGPUReducer::construct() { + vector same, other; + for (int i = 0; i < 8; ++i) { + if (i != _tgtDeviceID) { + if (NVMatrix::canAccessPeer(_tgtDeviceID, i)) { + same.insert(same.begin() + rand() % (1 + same.size()), i); + } else { + other.insert(other.begin() + rand() % (1 + other.size()), i); + } + } + } + assert(same.size() == 3); + assert(other.size() == 4); + makeConnections(same, other); + for (vector::const_iterator it = _sources.begin(); it != _sources.end(); ++it) { + (*it)->start(); + } + for (vector::const_iterator it = 
_peers.begin(); it != _peers.end(); ++it) { + (*it)->start(); + } + return *this; +} + +void IEightGPUReducer::reduce(std::map& mats, float scaleIntermediates, float scaleTarget) { + assert(mats.size() == 8); + // Check if source matrices are 0-sized + bool zero = true; + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != _tgtDeviceID && it->second->getNumElements() != 0) { + zero = false; + break; + } + } + if (zero) { + mats[_tgtDeviceID]->resize(*mats[(_tgtDeviceID + 1) % 8]); + } else { + for (vector::const_iterator it = _sources.begin(); it != _sources.end(); ++it) { + (*it)->getQueue().enqueue(new ReduceStartMessage(scaleIntermediates, scaleTarget, mats)); + } + _finishQueue.dequeue(); + } + assert(_finishQueue.getNumElements() == 0); +} + +void IEightGPUReducer::reduce(std::map& mats, float scaleIntermediates) { + reduce(mats, scaleIntermediates, 1); +} + +void IEightGPUReducer::reduce(std::map& mats) { + reduce(mats, 1, 1); +} + +int IEightGPUReducer::getTgtDeviceID() const { + return _tgtDeviceID; +} + +/* ========================= + * EightGPUReducer1 + * ========================= + */ +EightGPUReducer1::EightGPUReducer1(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) { +} + +void EightGPUReducer1::makeConnections(vector& same, vector&other) { + // Setup segments on same truck + _peers.push_back(new ReducePeer(*this, _tgtDeviceID, &_finishQueue)); // peers[0] = tgt + _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue)); // peers[1] = same truck 1 + _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue)); // peers[2] = same truck 2 + _sources.push_back(new ReducerSource(*this,same[2])); // sources[0] = same truck 3 + + _sources[0]->addNext(*_peers[2]); + _peers[2]->addNext(*_peers[1]); + _peers[1]->addNext(*_peers[0]); + + // Setup segments on other truck + _sources.push_back(new ReducerSource(*this,other[0])); // sources[1] = other truck 1 + _peers.push_back(new ReducePeer(*this,other[1], &_finishQueue)); // peers[3] = other truck 2 + _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue)); // peers[4] = other truck 3 + _sources.push_back(new ReducerSource(*this,other[3])); // sources[2] = other truck 4 + _peers.push_back(new ReducePeer(*this)); // peers[5] = host 1 + _peers.push_back(new ReducePeer(*this)); // peers[6] = host 2 + _peers.push_back(new ReducePeer(*this)); // peers[7] = host 3 + + _sources[1]->addNext(*_peers[3]); + _peers[3]->addNext(*_peers[5]); + _peers[5]->addNext(*_peers[7]); + _peers[7]->addNext(*_peers[0]); + _peers[4]->addNext(*_peers[6]); + _peers[6]->addNext(*_peers[7]); + _sources[2]->addNext(*_peers[4]); +} + +/* ========================= + * EightGPUReducer2 + * ========================= + */ +EightGPUReducer2::EightGPUReducer2(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) { +} + +void EightGPUReducer2::makeConnections(vector& same, vector&other) { + // Setup segments on same truck + _peers.push_back(new ReducePeer(*this,_tgtDeviceID, &_finishQueue)); // peers[0] = tgt + _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue)); // peers[1] = same truck 1 + _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue)); // peers[2] = same truck 2 + _sources.push_back(new ReducerSource(*this,same[2])); // sources[0] = same truck 3 + + _sources[0]->addNext(*_peers[2]); + _peers[2]->addNext(*_peers[1]); + _peers[1]->addNext(*_peers[0]); + + // Setup segments on other truck + _sources.push_back(new ReducerSource(*this,other[0])); // sources[1] = other truck 1 + _peers.push_back(new 
ReducePeer(*this,other[1], &_finishQueue)); // peers[3] = other truck 2 + _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue)); // peers[4] = other truck 3 + _peers.push_back(new ReducePeer(*this,other[3], &_finishQueue)); // peers[5] = other truck 4 + _peers.push_back(new ReducePeer(*this)); // peers[6] = host 1 + + _sources[1]->addNext(*_peers[3]); + _peers[3]->addNext(*_peers[4]); + _peers[4]->addNext(*_peers[5]); + _peers[5]->addNext(*_peers[6]); + _peers[6]->addNext(*_peers[0]); +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu new file mode 100644 index 0000000..b8de719 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu @@ -0,0 +1,173 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/streambroadcast.cuh" + +using namespace std; + +/* + * ===================== + * StreamBroadcast + * ===================== + */ + +StreamBroadcast::StreamBroadcast(map& streams) { + _streams = streams; +} + +StreamBroadcast::StreamBroadcast() { +} + +void StreamBroadcast::toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice) { + src.copy(hostmem, _streams[srcDevice]); +} + +void StreamBroadcast::toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput) { + tgt.add(hostmem, scaleTarget, scaleOutput, tgt, _streams[tgtDevice]); +} + +void StreamBroadcast::init(map& mats) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (_streams.count(it->first) == 0) { + _ownedStreams.insert(it->first); + NVMatrix::setDeviceID(it->first); + checkCudaErrors(cudaStreamCreateWithFlags(&_streams[it->first], cudaStreamNonBlocking)); + } + } +} + +StreamBroadcast::~StreamBroadcast() { + for (set::const_iterator it = _ownedStreams.begin(); it != _ownedStreams.end(); ++it) { + checkCudaErrors(cudaStreamDestroy(_streams[*it])); + } +} + +cudaStream_t StreamBroadcast::getStream(int deviceID) { + return _streams[deviceID]; +} + +// Sync stream associated with given device id +void StreamBroadcast::sync(int deviceID) { + NVMatrix::syncStream(_streams[deviceID]); +} + +void StreamBroadcast::transfer(map& mats, int srcDevice) { + transfer(mats, _hostMem, srcDevice, 0, 1); +} + +void StreamBroadcast::transfer(map& mats, int srcDevice, float scaleTarget, float scaleOutput) { + transfer(mats, _hostMem, srcDevice, scaleTarget, scaleOutput); +} + +void StreamBroadcast::transfer(map& mats, HostNVMatrix& hostbuf, int srcDevice, float scaleTarget, float scaleOutput) { + int oldDeviceID = NVMatrix::getDeviceID(); + assert(mats.count(srcDevice) != 0); + init(mats); +// assert(_streams.count(srcDevice) != 0); + if (mats.size() > 1) { + if (mats[srcDevice]->getNumElements() == 0) { + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + it->second->resize(*mats[srcDevice]); + } + } else { + int tgtDevice = mats.begin()->first != 
srcDevice ? mats.begin()->first : (++mats.begin())->first; + // This case is a simple copy + if (mats.size() == 2 && NVMatrix::canAccessPeer(tgtDevice, srcDevice)) { + NVMatrix::setDeviceID(tgtDevice); + mats[tgtDevice]->add(*mats[srcDevice], scaleTarget, scaleOutput, *mats[tgtDevice], _streams[tgtDevice]); + } else { + NVMatrix& src = *mats[srcDevice]; + if (hostbuf.getNumElements() < src.getNumElements()) { + hostbuf.resize(1,src.getNumElements()); + } + hostbuf.setTrans(src.isTrans()); + + NVMatrix& hostmat = hostbuf.sliceCols(0, src.getNumElements()); + assert(hostmat.isView()); + hostmat.reshape(src.getNumRows(), src.getNumCols()); + + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + assert(it->second->isContiguous()); + NVMatrix::setDeviceID(it->first); + it->second->resize(src); + assert(it->second->isTrans() == src.isTrans()); + } + int numChunks = min(DIVUP(src.getNumElements(), SB_MIN_CHUNK_SIZE), SB_MAX_CHUNKS); + + if (numChunks == 1) { // This is a bit faster for small matrices + NVMatrix::setDeviceID(srcDevice); + toHostMem(src, hostmat, srcDevice); + NVMatrix::syncStream(_streams[srcDevice]); + + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != src.getDataDeviceID()) { + NVMatrix::setDeviceID(it->first); + toTarget(hostmat, *it->second, it->first, scaleTarget, scaleOutput); + } + } + } else { + int n = src.getNumElements(); + + map lines; + for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + lines[it->first] = &it->second->reshaped(1, n); + lines[it->first]->setTrans(src.isTrans()); + } + NVMatrix& srcLine = *lines[srcDevice]; + hostmat.reshape(1, n); + + int chunkSize = DIVUP(n, numChunks); + bool trans = src.isTrans(); + for (int i = 0; i < numChunks; ++i) { + int start = i * chunkSize; + int end = min((i+1) * chunkSize, n); + if (start < end) { + NVMatrix& tmpSrc = srcLine.sliceCols(start, end); // view + NVMatrix& tmpHostmem = hostmat.sliceCols(start, end); // view + + NVMatrix::setDeviceID(srcDevice); + toHostMem(tmpSrc, tmpHostmem, srcDevice); + NVMatrix::syncStream(_streams[srcDevice]); + + for (map::const_iterator it = lines.begin(); it != lines.end(); ++it) { + if (it->first != srcDevice) { + NVMatrix& tmpTgt = it->second->sliceCols(start, end); // view + NVMatrix::setDeviceID(it->first); + toTarget(tmpHostmem, tmpTgt, it->first, scaleTarget, scaleOutput); + delete &tmpTgt; + } + } + delete &tmpSrc; + delete &tmpHostmem; + } + } + for (map::const_iterator it = lines.begin(); it != lines.end(); ++it) { + delete it->second; + } + } + delete &hostmat; + } + for(map::const_iterator it = mats.begin(); it != mats.end(); ++it) { + if (it->first != srcDevice) { + NVMatrix::syncStream(_streams[it->first]); + } + } + } + } + if (oldDeviceID >= 0) { + NVMatrix::setDeviceID(oldDeviceID); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu new file mode 100644 index 0000000..13a1533 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu @@ -0,0 +1,217 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../include/util.cuh" + +using namespace std; + +stringv* getStringV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + stringv* vec = new stringv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(std::string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i)))); + } + return vec; +} + +floatv* getFloatV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + floatv* vec = new floatv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +intv* getIntV(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + intv* vec = new intv(); + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +int* getIntA(PyObject* pyList) { + if (pyList == NULL) { + return NULL; + } + int* arr = new int[PyList_GET_SIZE(pyList)]; + for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { + arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)); + } + return arr; +} + +MatrixV* getMatrixV(PyObject* pyList) { + return getMatrixV(pyList, PyList_GET_SIZE(pyList)); +} + +MatrixV* getMatrixV(PyObject* pyList, int len) { + if (pyList == NULL) { + return NULL; + } + MatrixV* vec = new MatrixV(); + for (int i = 0; i < len; i++) { + vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i))); + } + return vec; +} + +PyObjectV* pyDictGetValues(PyObject* dict) { + PyObjectV* pov = new PyObjectV(); + PyObject* valuesList = PyDict_Values(dict); + int numValues = PyList_GET_SIZE(valuesList); + + for (int i = 0; i < numValues; i++) { + pov->push_back(PyList_GET_ITEM(valuesList, i)); + } + Py_DECREF(valuesList); + return pov; +} + +int pyDictGetInt(PyObject* dict, const char* key) { + return PyInt_AS_LONG(PyDict_GetItemString(dict, key)); +} + +intv* pyDictGetIntV(PyObject* dict, const char* key) { + return getIntV(PyDict_GetItemString(dict, key)); +} + +int* pyDictGetIntA(PyObject* dict, const char* key) { + return getIntA(PyDict_GetItemString(dict, key)); +} + +std::string pyDictGetString(PyObject* dict, const char* key) { + return std::string(PyString_AS_STRING(PyDict_GetItemString(dict, key))); +} + +float pyDictGetFloat(PyObject* dict, const char* key) { + return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key)); +} + +floatv* pyDictGetFloatV(PyObject* dict, const char* key) { + return getFloatV(PyDict_GetItemString(dict, key)); +} + +Matrix* pyDictGetMatrix(PyObject* dict, const char* key) { + return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key)); +} + +MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) { + return getMatrixV(PyDict_GetItemString(dict, key)); +} + +stringv* pyDictGetStringV(PyObject* dict, const char* key) { + return getStringV(PyDict_GetItemString(dict, key)); +} + +bool pyDictHasKey(PyObject* dict, const char* key) { + PyObject* str = PyString_FromString(key); + bool b = PyDict_Contains(dict, str); + Py_DECREF(str); + return b; +} + +template +void shuffleVector(vector& v, int start, int end) { + 
const int len = end - start; + for (int i = 0; i < len*5; ++i) { + int r1 = start + rand() % len; + int r2 = start + rand() % len; + int tmp = v[r1]; + v[r1] = v[r2]; + v[r2] = tmp; + } +} + +template +std::string tostr(T n) { + ostringstream result; + result << n; + return result.str(); +} + +template +void deleteElements(vector& v) { + deleteElements(v, false); +} + +template +void deleteElements(vector& v, bool deleteContainer) { + for (typename vector::const_iterator it = v.begin(); it != v.end(); ++it) { + delete *it; + } + if (deleteContainer) { + delete &v; + } +} + +static Lock deviceCPULock; +static std::map > deviceCPUs; + +std::vector& getDeviceCPUs(int deviceID) { + deviceCPULock.acquire(); + if (deviceCPUs.count(deviceID) == 0 && deviceID >= 0) { + struct cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, deviceID)); + char pciString[13]; + + sprintf(pciString, "%04x", props.pciDomainID); + pciString[4] = ':'; + sprintf(pciString + 5, "%02x", props.pciBusID); + pciString[7] = ':'; + sprintf(pciString + 8, "%02x", props.pciDeviceID); + pciString[10] = '.'; + pciString[11] = '0'; + pciString[12] = 0; + std::string path = std::string("/sys/bus/pci/devices/") + std::string(pciString) + "/local_cpulist"; + ifstream f(path.c_str()); + + if (f.is_open()) { + std::string cpuString; + while (getline(f, cpuString, ',')) { + int start, end; + int found = sscanf(cpuString.c_str(), "%d-%d", &start, &end); + end = found == 1 ? start : end; + if (found > 0) { + for (int i = start; i <= end; ++i) { + deviceCPUs[deviceID].push_back(i); + } + } + } + f.close(); + } else { + printf("Unable to open %s\n", path.c_str()); + } + } + vector& ret = deviceCPUs[deviceID]; + deviceCPULock.release(); + return ret; +} + +template void shuffleVector(std::vector& v, int start, int end); +template std::string tostr(int n); +template void deleteElements(std::vector& v, bool deleteContainer); diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu new file mode 100644 index 0000000..51cffa9 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu @@ -0,0 +1,460 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/weights.cuh" +#include "../include/lr.cuh" +#include "../include/worker.cuh" + +using namespace std; + +/* ======================== + * IWeightReducer + * ======================== + */ +int IWeightReducer::getDeviceID() { + return _replicas[_tgtReplicaID]->getDeviceID(); +} + +IWeightReducer::IWeightReducer(std::map& replicas, int tgtReplicaID) : _replicas(replicas), _tgtReplicaID(tgtReplicaID) { +} + +IWeightReducer::~IWeightReducer() { +} + +IWeightReducer& IWeightReducer::make(std::map& replicas, int tgtReplicaID) { + if (replicas.size() == 8) { + return *new ParallelWeightReducer(replicas, tgtReplicaID); + } + return *new SequentialWeightReducer(replicas, tgtReplicaID); +} + +/* ======================== + * SequentialWeightReducer + * ======================== + */ +SequentialWeightReducer::SequentialWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { + _sb = new StreamBroadcast(); +} + +SequentialWeightReducer::~SequentialWeightReducer() { + delete _sb; +} + +void SequentialWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { + std::map mats; // device id -> grad + mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); + for (int i = 0, r = _tgtReplicaID; i < _replicas.size(); ++i, r = (r + 1) % _replicas.size()) { + if (r != _tgtReplicaID) { + mats[_replicas[r]->getDeviceID()] = gradShards[r]; + _sb->transfer(mats, _replicas[r]->getDeviceID(), 1, gradScale); + mats.erase(_replicas[r]->getDeviceID()); + } + } +} + +/* ======================== + * ParallelWeightReducer + * ======================== + */ +ParallelWeightReducer::ParallelWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { + _reducer = &(new EightGPUReducer1(getDeviceID()))->construct(); +} + +ParallelWeightReducer::~ParallelWeightReducer() { + delete _reducer; +} + +void ParallelWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { + std::map mats; // device id -> grad + mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); + for (std::map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + if (it->first != _tgtReplicaID) { + mats[it->second->getDeviceID()] = gradShards[it->first]; + } + } + _reducer->reduce(mats, gradScale, 1); +} + +// weights has pointer to layer, layer pointer to thread +// thread has sync (copy) object for every other thread +// weights uses copy object to sum grad contributions into inc matrix slice (phase 1) +// weights broadcasts inc matrix slice to other inc matrix replicas (phase 2) + +NVMatrix& Weights::operator*() const { + return getW(); +} + +/* + * TODO: get rid of this constructor duplication. 
+ */ +Weights::Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent) { + init(srcWeights.getCPUW(), srcWeights.getCPUWInc(), lrs, parent, 0, 0, srcWeights.getMom(), srcWeights.isUseGrad(), false); + _srcWeights = &srcWeights; +} + +Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, + float wball, float mom, bool useGrad) { + init(hWeights, hWeightsInc, lrs, parent, wc, wball, mom, useGrad, true); +} + +void Weights::init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, + float wball, float mom, bool useGrad, bool cleanup) { + _srcWeights = NULL; + _hWeights = &hWeights; + _hWeightsInc = &hWeightsInc; + _numUpdates = 0; + _lrs = &lrs; + _parent = &parent; + _wc = wc; + _wball = wball; + _mom = mom; + _useGrad = useGrad; + _onGPU = false; + _weights = NULL; + _weightsInc = NULL; + _weightsGrad = NULL; + _cleanup = cleanup; + _reducer = NULL; + _broadcaster = NULL; +} + +Weights::~Weights() { + delete _lrs; + delete _reducer; + delete _broadcaster; + if (_cleanup) { + delete _hWeights; + delete _hWeightsInc; + if (_srcWeights == NULL) { + delete _weights; + delete _weightsInc; + delete _weightsGrad; + } + } +} + +NVMatrix& Weights::getW() const { + assert(_onGPU); + return *_weights; +} + +NVMatrix& Weights::getInc() const { + assert(_onGPU); + return *_weightsInc; +} + +/* + * TODO: This seems like pretty nasty behavior, I should change this. + */ +NVMatrix& Weights::getGrad() const { + assert(_onGPU); + return _useGrad ? *_weightsGrad : *_weightsInc; +} + +Matrix& Weights::getCPUW() const { + return *_hWeights; +} + +Matrix& Weights::getCPUWInc() const { + return *_hWeightsInc; +} + +int Weights::getNumRows() const { + return _hWeights->getNumRows(); +} + +int Weights::getNumCols() const { + return _hWeights->getNumCols(); +} + +map& Weights::getReplicas() { + return _replicas; +} + +template T& Weights::getShard(T& mat, int replicaID) { + const int n = mat.getNumElements(); + T& line = mat.reshaped(1, n); + const int shardStart = min(n, replicaID * _shardSize); + const int shardEnd = min(n, (replicaID + 1) * _shardSize); + T& slice = line.sliceCols(shardStart, shardEnd); + assert(slice.isView()); + delete &line; + return slice; +} + +template T& Weights::getShard(T& mat) { + return getShard(mat, getReplicaID()); +} + +ISafeBroadcastNetwork& Weights::getBroadcaster() { + if (_broadcaster == NULL) { + set devices; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + devices.insert(it->second->getDeviceID()); + } + // NOTE: we must use safe broadcaster becasue we want to *add* our value to everyone else + _broadcaster = &ISafeBroadcastNetwork::make(devices, getDeviceID()); //&(new NaiveBroadcaster(devices, getDeviceID()))->construct(); + } + return *_broadcaster; +} + +IWeightReducer& Weights::getReducer() { + if (_reducer == NULL) { + _reducer = &IWeightReducer::make(_replicas, getReplicaID()); + } + return *_reducer; +} + +void Weights::copyToCPU() { + if (_srcWeights == NULL) { + assert(_onGPU); + NVMatrix::syncStream(); // for safety + if (getReplicaID() == 0) { + _weights->copyToHost(*_hWeights); + + // Synchronize weights amongst replicas while we're at it. + map weights; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + weights[it->second->getDeviceID()] = &it->second->getW(); + } + // These things sync before returning. 
+ getBroadcaster().broadcast(weights, 1, 0); + } + if (_useGrad) { + Matrix& hIncShard = getShard(*_hWeightsInc); + _weightsInc->copyToHost(hIncShard); + delete &hIncShard; + } else { // In this case there's definitely only one replica + _weightsInc->copyToHost(*_hWeightsInc); + } + } +} + +// This function is assumed to be called in the order in which the layers +// were defined +void Weights::copyToGPU() { + assert(!_onGPU); + // Copies are performed on the default (computation) stream, so that's fine. + if (_srcWeights == NULL) { + _weights = _weights == NULL ? new NVMatrix() : _weights; + _weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc; + _weights->copyFromHost(*_hWeights, true); + + if (_useGrad) { + // In this case there is no need to store the entire inc matrix. + // Just this replica's shard (for synchronization purposes) will do. + Matrix& hIncShard = getShard(*_hWeightsInc); + _weightsInc->copyFromHost(hIncShard, true); + delete &hIncShard; + } else { + _weightsInc->copyFromHost(*_hWeightsInc, true); + } + + _weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL; + } else { + _weights = _srcWeights->_weights; + _weightsInc = _srcWeights->_weightsInc; + _weightsGrad = _srcWeights->_weightsGrad; + } + _onGPU = true; +} + +void Weights::aggregateReplicaGradients(float progress) { + map gradShards; + map wShards; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + gradShards[it->first] = &getShard(it->second->getGrad(), getReplicaID()); + wShards[it->first] = &getShard(it->second->getW(), getReplicaID()); + assert(wShards[it->first]->isContiguous() && gradShards[it->first]->isContiguous()); + } + + float gradScale = _lrs->getValue(progress); + NVMatrix::setDeviceID(getDeviceID()); + + if (_wc > 0) { + NVMatrixTernaryOps::WeightedAdd wadd = NVMatrixTernaryOps::WeightedAdd(_mom, gradScale, -_wc * _lrs->getValue(progress)); + _weightsInc->applyTernary(wadd, *gradShards[getReplicaID()], *wShards[getReplicaID()], *_weightsInc); + } else { + _weightsInc->add(*gradShards[getReplicaID()], _mom, gradScale); + } + + // Reduce everyone's gradient into my inc shard + NVMatrix::syncStream(); // Crucial since the reducer does everything in its own streams!! + getReducer().reduce(gradShards, gradScale, true); + + // Broadcast my inc -> all replicas + map mats; // device id -> grad + mats[getDeviceID()] = _weightsInc; + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + if (it->first != getReplicaID()) { + mats[it->second->getDeviceID()] = wShards[it->first]; + } + } + getBroadcaster().broadcast(mats, 1, 1); + + NVMatrix::setDeviceID(getDeviceID()); + wShards[getReplicaID()]->add(*_weightsInc); + + // Cleanup + for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { + delete gradShards[it->first]; + delete wShards[it->first]; + } +} + + +// When _useGrad is false, weightsInc is assumed to contain the +// entire, properly scaled weight increment. +// OTHERWISE, scale your gradient by 1 / numCases only. +// The scaling by epsW will be done in this routine. 
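The comment block above states the scaling contract for `Weights::update`: the caller scales the gradient by 1 / numCases, and this routine applies the learning rate (epsW), momentum, and weight decay when forming the increment. As a reading aid only, here is a minimal NumPy sketch of the increment rule that the `WeightedAdd(_mom, gradScale, -_wc * lr)` call in `aggregateReplicaGradients` appears to implement; `eps`, `mom`, and `wc` stand in for `_lrs->getValue(progress)`, `_mom`, and `_wc`, and nothing here is part of the diff itself.

```python
import numpy as np

def sgd_step(w, inc, grad, eps=0.01, mom=0.9, wc=0.0005):
    # Sketch of the rule applied per shard in aggregateReplicaGradients:
    #   inc = mom * inc + eps * grad - eps * wc * w   (momentum + lr + L2 decay)
    #   w   = w + inc
    # 'grad' is assumed to already be scaled by 1 / numCases by the caller.
    inc = mom * inc + eps * grad - eps * wc * w
    return w + inc, inc

# Toy usage: a 4x4 weight matrix driven by a constant gradient.
w = np.zeros((4, 4), dtype=np.float32)
inc = np.zeros_like(w)
grad = np.ones_like(w)
w, inc = sgd_step(w, inc, grad)
```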
+void Weights::update(float progress) { + // Only true owner of weights updates +// printf("%s update weights\n", _parent->getName().c_str()); + if (_srcWeights == NULL && _lrs->getBaseValue() > 0) { + assert(_onGPU); + if (_useGrad) { + aggregateReplicaGradients(progress); + } else { // Definitely no replicas in this case + if (_wc > 0) { + _weightsInc->add(*_weights, -_wc * _lrs->getValue(progress)); + } + _weights->add(*_weightsInc); + } + _numUpdates = 0; + } +} + +int Weights::incNumUpdates() { + if (_srcWeights != NULL) { + return _srcWeights->incNumUpdates(); + } + return _numUpdates++; +} + +// Returns the number of times a gradient has been computed for this +// weight matrix during the current pass (interval between two calls of update()) +// through the net. This number will only be greater than 1 if this weight matrix +// is *shared* by multiple layers in the net. +int Weights::getNumUpdates() const { + if (_srcWeights != NULL) { + return _srcWeights->getNumUpdates(); + } + return _numUpdates; +} + +float Weights::getEps(float progress) const { + return _lrs->getValue(progress); +} + +float Weights::getMom() const { + return _mom; +} + +float Weights::getWC() const { + return _wc; +} + +float Weights::getWBall() const { + return _wball; +} + +bool Weights::isUseGrad() const { // is good grammar + return _useGrad; +} + +bool Weights::isOwner() const { + return _srcWeights == NULL; +} + +ParameterSchedule& Weights::getLearningRateSchedule() const { + return *_lrs; +} + +void Weights::addReplica(Weights& replica) { + _replicas[replica.getReplicaID()] = &replica; + + const int n = _hWeights->getNumElements(); + _shardSize = DIVUP(n, _replicas.size()); +} + +int Weights::getReplicaID() { + return _parent->getReplicaID(); +} + +int Weights::getDeviceID() { + return _parent->getDeviceID(); +} + +Layer& Weights::getParent() { + return *_parent; +} + +/* + * =============== + * WeightList + * =============== + */ +Weights& WeightList::operator[](const int i) const { + return *_weightList[i]; +} + +Weights& WeightList::at(const int i) const { + return *_weightList[i]; +} + +WeightList::~WeightList() { + for (int i = 0; i < _weightList.size(); i++) { + delete _weightList[i]; + } +} + +WeightList::WeightList() { +} + +void WeightList::addWeights(Weights& w) { + _weightList.push_back(&w); +} + + +void WeightList::update(float progress) { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->update(progress); + } +} + +void WeightList::copyToCPU() { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->copyToCPU(); + } +} + +void WeightList::copyToGPU() { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->copyToGPU(); + } +} + +int WeightList::getSize() const { + return _weightList.size(); +} + +void WeightList::addReplica(WeightList& replica) { + for (int i = 0; i < getSize(); i++) { + _weightList[i]->addReplica(replica[i]); + } +} diff --git a/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu new file mode 100644 index 0000000..50d9b8e --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu @@ -0,0 +1,320 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "../include/util.cuh" +#include "../include/worker.cuh" +#include "../include/timer.cuh" + +using namespace std; + +/* + * ==================== + * WorkResult + * ==================== + */ +WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) { +} + +WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) { +} + +WorkResult::~WorkResult() { + delete _results; // delete NULL is ok +} + +Cost& WorkResult::getResults() const { + return *_results; +} + +WorkResult::RESULTS WorkResult::getResultType() const { + return _resultType; +} + +/* + * ==================== + * Worker + * ==================== + */ +Worker::Worker(ConvNet& convNet) : _convNet(&convNet) { +} + +Worker::~Worker() { +} + +/* + * ==================== + * DataWorker + * ==================== + */ +DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data), _dp(NULL) { + assert(_data != NULL); +} + +bool DataWorker::run() { + _dp = &_convNet->getDataProvider(); + _dp->setData(*_data); + _run(); + _dp->clearData(); + return false; +} + +DataWorker::~DataWorker() { +} + +/* + * ==================== + * TrainingWorker + * ==================== + */ +TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test) + : DataWorker(convNet, data), _progress(progress), _test(test) { +} + +void TrainingWorker::_run() { + _convNet->setTrainingProgress(_progress); + Cost& batchCost = *new Cost(); + int numMinibatches = _dp->getNumMinibatches(); + for (int i = 0; i < numMinibatches; i++) { + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(i, p, _test ? 
PASS_TEST : PASS_TRAIN); + _convNet->getCost(batchCost); + + if (!_test) { + _convNet->bprop(p, PASS_TRAIN); + _convNet->updateWeights(p); + } + } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * SyncWorker + * ==================== + */ +SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) { +} + +bool SyncWorker::run() { + _convNet->copyToCPU(); + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE)); + return false; +} + +/* + * ==================== + * ExitWorker + * ==================== + */ +ExitWorker::ExitWorker(ConvNet& convNet) : Worker(convNet) { +} + +bool ExitWorker::run() { + return true; +} + +/* + * ==================== + * GradCheckWorker + * ==================== + */ +GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data) + : DataWorker(convNet, data) { +} + +void GradCheckWorker::_run() { + _convNet->checkGradients(); + exit(0); // eh +} + +/* + * ==================== + * MultiviewTestWorker + * ==================== + */ +MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName) + : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) { +// assert(_data->getNumCases() % _numViews == 0); +// assert(convNet.getNumReplicas() == 1); // For now? +} + +MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews) + : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") { +// assert(_data->getNumCases() % _numViews == 0); +} + +MultiviewTestWorker::~MultiviewTestWorker() { +// delete _cpuProbs; +} + +CPUData& MultiviewTestWorker::getMinibatch(int v, int i) { + int numCasesPerView = _dp->getNumCases() / _numViews; + int miniStart = v * numCasesPerView + i * _dp->getMinibatchSize(); + int miniEnd = v * numCasesPerView + min(numCasesPerView, (i + 1) * _dp->getMinibatchSize()); + CPUData& mini = _dp->getDataSlice(miniStart, miniEnd); + return mini; +} + +void MultiviewTestWorker::_run() { + int numCasesPerView = _dp->getNumCases() / _numViews; + int numMiniPerView = DIVUP(numCasesPerView, _dp->getMinibatchSize()); + + Cost& batchCost = *new Cost(); + for (int i = 0; i < numMiniPerView; i++) { + for (int v = 0; v < _numViews - 1; v++) { + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(getMinibatch(v, i), p, v == 0 ? 
PASS_MULTIVIEW_TEST_START : PASS_MULTIVIEW_TEST); + } + } + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(getMinibatch(_numViews - 1, i), p, PASS_MULTIVIEW_TEST_END); + _convNet->getCost(batchCost); + } +// if (_cpuProbs != NULL) { +// LogregCostLayer& logregLayer = *dynamic_cast(&_convNet->getLayer(_logregName, 0)); +// NVMatrix::setDeviceID(logregLayer.getDeviceID()); +// Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(), +// min(numCasesReal, (i + 1) * _dp->getMinibatchSize())); +// NVMatrix& acts = logregLayer.getProbsAccum(); +// NVMatrix acts_T; +// acts.transpose(acts_T); +// acts_T.copyToHost(miniProbs); +// +// delete &miniProbs; +// } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * FeatureWorker + * ==================== + */ +FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures) + : DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames), _deleteFeatures(deleteFeatures) { + assert(layerNames.size() == ftrs.size()); + for (int i = 0; i < layerNames.size(); i++) { + assert(ftrs[i]->getNumRows() == data.getNumCases()); + assert(!ftrs[i]->isTrans()); + } +} + +FeatureWorker::~FeatureWorker() { + if (_deleteFeatures) { + for (int i = 0; i < _ftrs->size(); i++) { + delete _ftrs->at(i); + } + delete _ftrs; + } + delete _layerNames; +} + +void FeatureWorker::_run() { + Cost& batchCost = *new Cost(); + map repStart; // Feature write start offsets within minibatch + for (int i = 0; i < _dp->getNumMinibatches(); i++) { + for (int f = 0; f < _layerNames->size(); f++) { + repStart[f] = 0; + } + + for (int p = 0; p < _convNet->getNumPasses(); p++) { + _convNet->fprop(i, p, PASS_FEATURE_GEN); + _convNet->getCost(batchCost); + for (int f = 0; f < _layerNames->size(); f++) { + + if (_convNet->getLayer(_layerNames->at(f), 0).getFwdActiveInputReplicaIdx(p) >= 0) { + Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(), + min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); + + for (int r = 0; r < _convNet->getLayer(_layerNames->at(f), 0).getNumReplicas(); ++r) { + Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f), r); + int d = ftrLayer.getDeviceID(); + NVMatrix::setDeviceID(d); + NVMatrix& acts = ftrLayer.getActs(); + + Matrix& repMiniFtrs = miniFtrs.sliceRows(repStart[f], + min(int(miniFtrs.getNumRows()), repStart[f] + acts.getLeadingDim())); + + NVMatrix acts_T; + acts.transpose(false); + acts.transpose(acts_T); + acts_T.copyToHost(repMiniFtrs); + NVMatrix::syncStream(); // eh why not + + delete &repMiniFtrs; + + repStart[f] += acts.getLeadingDim(); + } + delete &miniFtrs; + } + } + } + } + _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} + +/* + * ==================== + * DataGradWorker + * ==================== + */ +DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx) + : DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) { +// assert(dataGrads.getNumRows() == data.getNumCases()); +// assert(!dataGrads.isTrans()); +} + +DataGradWorker::~DataGradWorker() { +// delete _dataGrads; +} + +void DataGradWorker::_run() { +// DataLayer& dataLayer = *dynamic_cast(&_convNet->getLayer(_dataLayerIdx)); +// SoftmaxLayer& softmaxLayer = *dynamic_cast(&_convNet->getLayer(_softmaxLayerIdx)); +// 
softmaxLayer.setDoLogregGrad(false); +// Cost& batchCost = *new Cost(0); +// for (int i = 0; i < _dp->getNumMinibatches(); i++) { +// _convNet->fprop(i, PASS_TEST); +// _convNet->getCost(batchCost); +// softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad()); +// +// softmaxLayer.getActsGrad().addScalar(1); +// softmaxLayer.getActsGrad().scale(-1); +// softmaxLayer.incRcvdBInputs(); +// softmaxLayer.bprop(PASS_TEST); +// +// Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(), +// min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); +// NVMatrix& grads = dataLayer.getActsGrad(); +// NVMatrix grads_T; +// if (grads.isTrans()) { +// NVMatrix& soft_T = grads.getTranspose(); +// soft_T.transpose(grads_T); +// delete &soft_T; +// } else { +// grads.transpose(grads_T); +// } +// grads_T.copyToHost(miniDataGrads); +// delete &miniDataGrads; +// +// _convNet->reset(); +// } +// cudaThreadSynchronize(); +// _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); +} diff --git a/caffe2/contrib/cuda-convnet2/images/show-cost.png b/caffe2/contrib/cuda-convnet2/images/show-cost.png new file mode 100644 index 0000000..1e2ad5a Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-cost.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png b/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png new file mode 100644 index 0000000..c2bc364 Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-filters-no-rgb.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-filters.png b/caffe2/contrib/cuda-convnet2/images/show-filters.png new file mode 100644 index 0000000..ca275e9 Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-filters.png differ diff --git a/caffe2/contrib/cuda-convnet2/images/show-preds.png b/caffe2/contrib/cuda-convnet2/images/show-preds.png new file mode 100644 index 0000000..0d5550f Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/images/show-preds.png differ diff --git a/caffe2/contrib/cuda-convnet2/initw.py b/caffe2/contrib/cuda-convnet2/initw.py new file mode 100644 index 0000000..8f068a3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/initw.py @@ -0,0 +1,54 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from python_util.gpumodel import * +import numpy as n +import numpy.random as nr + +def get_src(filename): + src = IGPUModel.load_checkpoint(filename) + return src['model_state']['layers'] + +# Initialize weight matrix by copying weight matrix of given layer +def makew(name, idx, shape, params): + src = get_src(params[0]) + return src[name]['weights'][idx] + +# Initialize bias vector by copying bias vector of given layer +def makeb(name, shape, params): + src = get_src(params[0]) + return src[name]['biases'] + +def concat(shape, src, src_layers, src_func): + mat = n.empty(shape, dtype=n.single, order='F') + start = 0 + for s in src_layers: + m = src_func(src[s]) + mat[:,start:start+m.shape[1]] = m + start += m.shape[1] + return mat + +# Initialize weight matrix by concatenating weight matrices of given layers +def makewcat(name, idx, shape, params): + src, src_layers = get_src(params[0]), params[1:] + return concat(shape, src, src_layers, lambda x: x['weights'][idx]) + +# Initialize bias vector by concatenating bias vectors of given layers +def makebcat(name, shape, params): + src, src_layers = get_src(params[0]), params[1:] + return concat(shape, src, src_layers, lambda x: x['biases']) + +# Initialize bias vector from tuple input +def makeb_vec(name, shape, params): + return n.array([n.single(x) for x in params], dtype=n.single).reshape((1, len(params))) diff --git a/caffe2/contrib/cuda-convnet2/layer.py b/caffe2/contrib/cuda-convnet2/layer.py new file mode 100644 index 0000000..8baef39 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layer.py @@ -0,0 +1,1537 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
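The `initw.py` helpers above initialize a layer's parameters from a saved checkpoint; `makewcat` and `makebcat` build one matrix by laying the source layers' matrices side by side in column blocks via `concat()`. Below is a self-contained NumPy sketch of that column-concatenation pattern; the layer names and shapes are made up for illustration, whereas the real functions read them from an `IGPUModel` checkpoint.

```python
import numpy as np

# Stand-in for src = get_src(checkpoint): two fake source layers whose weight
# matrices share the row count (16) and contribute 8 and 24 columns.
src = {'fc1': {'weights': [np.ones((16, 8), dtype=np.single)]},
       'fc2': {'weights': [np.full((16, 24), 2, dtype=np.single)]}}

def concat_cols(shape, src, src_layers, src_func):
    # Mirrors initw.concat: fill the target matrix column block by column block.
    mat = np.empty(shape, dtype=np.single, order='F')
    start = 0
    for s in src_layers:
        m = src_func(src[s])
        mat[:, start:start + m.shape[1]] = m
        start += m.shape[1]
    return mat

w = concat_cols((16, 32), src, ['fc1', 'fc2'], lambda x: x['weights'][0])
```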
+ +from math import exp +import sys +import ConfigParser as cfg +import os +import numpy as n +import numpy.random as nr +from math import ceil, floor +from collections import OrderedDict +from os import linesep as NL +from python_util.options import OptionsParser +import re + +class LayerParsingError(Exception): + pass + +# A neuron that doesn't take parameters +class NeuronParser: + def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): + self.type = type + self.func_str = func_str + self.uses_acts = uses_acts + self.uses_inputs = uses_inputs + + def parse(self, type): + if type == self.type: + return {'type': self.type, + 'params': {}, + 'usesActs': self.uses_acts, + 'usesInputs': self.uses_inputs} + return None + +# A neuron that takes parameters +class ParamNeuronParser(NeuronParser): + neuron_regex = re.compile(r'^\s*(\w+)\s*\[\s*(\w+(\s*,\w+)*)\s*\]\s*$') + def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): + NeuronParser.__init__(self, type, func_str, uses_acts, uses_inputs) + m = self.neuron_regex.match(type) + self.base_type = m.group(1) + self.param_names = m.group(2).split(',') + assert len(set(self.param_names)) == len(self.param_names) + + def parse(self, type): + m = re.match(r'^%s\s*\[([\d,\.\s\-]*)\]\s*$' % self.base_type, type) + if m: + try: + param_vals = [float(v.strip()) for v in m.group(1).split(',')] + if len(param_vals) == len(self.param_names): + return {'type': self.base_type, + 'params': dict(zip(self.param_names, param_vals)), + 'usesActs': self.uses_acts, + 'usesInputs': self.uses_inputs} + except TypeError: + pass + return None + +class AbsTanhNeuronParser(ParamNeuronParser): + def __init__(self): + ParamNeuronParser.__init__(self, 'abstanh[a,b]', 'f(x) = a * |tanh(b * x)|') + + def parse(self, type): + dic = ParamNeuronParser.parse(self, type) + # Make b positive, since abs(tanh(bx)) = abs(tanh(-bx)) and the C++ code + # assumes b is positive. 
+ if dic: + dic['params']['b'] = abs(dic['params']['b']) + return dic + +class ParamParser: + lrs_regex = re.compile(r'^\s*(\w+)\s*(?:\[\s*(\w+(\s*;\w+)*)\s*\])?\s*$') + param_converters = {'i': int, + 'f': float} + def __init__(self, type): + m = self.lrs_regex.match(type) + self.base_type = m.group(1) + param_names_with_type = m.group(2).split(';') if m.group(2) is not None else [] + self.param_names = [p[1:] for p in param_names_with_type] + self.param_types = [self.param_converters[p[0]] for p in param_names_with_type] + self.param_regex_inner = ";".join([('\s*%s\s*=\s*[^;,\s=]+\s*' % p) for p in self.param_names]) + self.regex_str = ('^%s\s*(?:\[(%s)\])?\s*$') % (self.base_type, self.param_regex_inner) + assert len(set(self.param_names)) == len(self.param_names) + + def parse(self, type): + m = re.match(self.regex_str, type, flags=re.IGNORECASE) + if m: + try: + param_vals = [ptype(v.split('=')[1].strip()) for ptype,v in zip(self.param_types, m.group(1).split(';'))] if m.group(1) is not None else [] + if len(param_vals) == len(self.param_names): + return {'type': self.base_type, + 'params': dict(zip(self.param_names, param_vals))} + except TypeError: + pass + return None + +# Subclass that throws more convnet-specific exceptions than the default +class MyConfigParser(cfg.SafeConfigParser): + def safe_get(self, section, option, f=cfg.SafeConfigParser.get, typestr=None, default=None): + try: + return f(self, section, option) + except cfg.NoOptionError, e: + if default is not None: + return default + raise LayerParsingError("Layer '%s': required parameter '%s' missing" % (section, option)) + except ValueError, e: + if typestr is None: + raise e + raise LayerParsingError("Layer '%s': parameter '%s' must be %s" % (section, option, typestr)) + + def safe_get_list(self, section, option, f=str, typestr='strings', default=None): + v = self.safe_get(section, option, default=default) + if type(v) == list: + return v + try: + return [f(x.strip()) for x in v.split(',')] + except: + raise LayerParsingError("Layer '%s': parameter '%s' must be ','-delimited list of %s" % (section, option, typestr)) + + def safe_get_int(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getint, typestr='int', default=default) + + def safe_get_float(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getfloat, typestr='float', default=default) + + def safe_get_bool(self, section, option, default=None): + return self.safe_get(section, option, f=cfg.SafeConfigParser.getboolean, typestr='bool', default=default) + + def safe_get_float_list(self, section, option, default=None): + return self.safe_get_list(section, option, float, typestr='floats', default=default) + + def safe_get_int_list(self, section, option, default=None): + return self.safe_get_list(section, option, int, typestr='ints', default=default) + + def safe_get_bool_list(self, section, option, default=None): + return self.safe_get_list(section, option, lambda x: x.lower() in ('true', '1'), typestr='bools', default=default) + +# A class that implements part of the interface of MyConfigParser +class FakeConfigParser(object): + def __init__(self, dic): + self.dic = dic + + def safe_get(self, section, option, default=None): + if option in self.dic: + return self.dic[option] + return default + + def safe_get_int(self, section, option, default=None): + return int(self.safe_get(section, option, default)) + + def safe_get_int_list(self, section, option, default=None): + 
return list(self.safe_get(section, option, default)) + +class LayerParser: + def __init__(self): + self.dic = {} + self.set_defaults() + + # Post-processing step -- this is called after all layers have been initialized + def optimize(self, layers): + self.dic['actsTarget'] = -1 + self.dic['actsGradTarget'] = -1 + if len(set(len(l['gpu']) for l in layers.values() if 'inputs' in l and self.dic['name'] in l['inputs'])) > 1: +# print set(len(l['gpu']) for l in layers.values()) + raise LayerParsingError("Layer '%s': all next layers must have equal number of replicas." % (self.dic['name'])) + + def parse_params(self, vals, parsers, param_name, human_name, num_params=1): + dic, name = self.dic, self.dic['name'] + +# print vals + if len(vals) != num_params and len(vals) != 1: + raise LayerParsingError("Layer '%s': expected list of length %d for %s but got list of length %d."% (name, num_params, param_name, len(vals))) + parsed = [] +# print vals + for v in vals: + for p in parsers: + parsedv = p.parse(v) + if parsedv: + parsed += [parsedv] + break + if len(parsed) == 1 and num_params > 1: + parsed = parsed * num_params + if len(parsed) == num_params: + return parsed +# print parsed, vals + raise LayerParsingError("Layer '%s': unable to parse %s %s=%s." % (name, human_name, param_name, ",".join(vals))) + + # Add parameters from layer parameter file + def add_params(self, mcp): + pass +# self.dic['conserveMem'] = mcp.convnet.op.get_value('conserve_mem') if mcp.convnet is not None else 0 + + def init(self, dic): + self.dic = dic + return self + + def set_defaults(self): + self.dic['outputs'] = 0 + self.dic['parser'] = self + self.dic['requiresParams'] = False + # Does this layer use its own activity matrix + # for some purpose other than computing its output? + # Usually, this will only be true for layers that require their + # own activity matrix for gradient computations. For example, layers + # with logistic units must compute the gradient y * (1 - y), where y is + # the activity matrix. + # + # Layers that do not not use their own activity matrix should advertise + # this, since this will enable memory-saving matrix re-use optimizations. + # + # The default value of this property is True, for safety purposes. + # If a layer advertises that it does not use its own activity matrix when + # in fact it does, bad things will happen. + self.dic['usesActs'] = True + + # Does this layer use the activity matrices of its input layers + # for some purpose other than computing its output? + # + # Again true by default for safety + self.dic['usesInputs'] = True + + # Force this layer to use its own activity gradient matrix, + # instead of borrowing one from one of its inputs. + # + # This should be true for layers where the mapping from output + # gradient to input gradient is non-elementwise. + self.dic['forceOwnActs'] = True + + # Does this layer need the gradient at all? + # Should only be true for layers with parameters (weights). 
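+        # (For instance, WeightLayerParser.add_params below sets gradConsumer = True
+        # whenever any of the layer's learning-rate schedules has a positive base rate.)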
+ self.dic['gradConsumer'] = False + + # The gpu indices on which this layer runs + self.dic['gpu'] = [-1] + + def parse(self, name, mcp, prev_layers, model=None): + self.prev_layers = prev_layers + self.dic['name'] = name + self.dic['type'] = mcp.safe_get(name, 'type') + self.dic['id'] = len(prev_layers) + + return self.dic + + def verify_float_range(self, v, param_name, _min, _max): + self.verify_num_range(v, param_name, _min, _max, strconv=lambda x: '%.3f' % x) + + def verify_num_range(self, v, param_name, _min, _max, strconv=lambda x:'%d' % x): + if type(v) == list: + for i,vv in enumerate(v): + self._verify_num_range(vv, param_name, _min, _max, i, strconv=strconv) + else: + self._verify_num_range(v, param_name, _min, _max, strconv=strconv) + + def _verify_num_range(self, v, param_name, _min, _max, input=-1, strconv=lambda x:'%d' % x): + layer_name = self.dic['name'] if input < 0 else '%s[%d]' % (self.dic['name'], input) + if _min is not None and _max is not None and (v < _min or v > _max): + raise LayerParsingError("Layer '%s': parameter '%s' must be in the range %s-%s" % (layer_name, param_name, strconv(_min), strconv(_max))) + elif _min is not None and v < _min: + raise LayerParsingError("Layer '%s': parameter '%s' must be greater than or equal to %s" % (layer_name, param_name, strconv(_min))) + elif _max is not None and v > _max: + raise LayerParsingError("Layer '%s': parameter '%s' must be smaller than or equal to %s" % (layer_name, param_name, strconv(_max))) + + def verify_divisible(self, value, div, value_name, div_name=None, input_idx=0): + layer_name = self.dic['name'] if len(self.dic['inputs']) == 0 else '%s[%d]' % (self.dic['name'], input_idx) + if value % div != 0: + raise LayerParsingError("Layer '%s': parameter '%s' must be divisible by %s" % (layer_name, value_name, str(div) if div_name is None else "'%s'" % div_name)) + + def verify_str_in(self, value, param_name, lst, input_idx=-1): + lname = self.dic['name'] if input_idx == -1 else ('%s[%d]' % (self.dic['name'], input_idx)) + if value not in lst: + raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (lname, param_name, ", ".join("'%s'" % s for s in lst))) + + def verify_int_in(self, value, param_name, lst): + if value not in lst: + raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) + + def verify_all_ints_in(self, values, param_name, lst): + if len([v for v in values if v not in lst]) > 0: + raise LayerParsingError("Layer '%s': all parameters to '%s' must be among %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) + + def verify_input_dims(self, dims): + for i,d in enumerate(dims): + if d is not None and self.dic['numInputs'][i] != d: # first input must be labels + raise LayerParsingError("Layer '%s': dimensionality of input %d must be %d" % (self.dic['name'], i, d)) + + # This looks for neuron=x arguments in various layers, and creates + # separate layer definitions for them. 
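+    # For example (hypothetical layer name): a layer 'fc1' declared with neuron=relu keeps
+    # its own definition, while a new layer 'fc1_neuron' of type 'neuron' is created on the
+    # same GPUs with 'fc1' as its input, and every layer that listed 'fc1' as an input is
+    # rewired to read from 'fc1_neuron' instead (see NeuronLayerParser.detach_neuron_layer).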
+ @staticmethod + def detach_neuron_layers(layers): + for name,l in layers.items(): + if l['type'] != 'neuron' and 'neuron' in l and l['neuron']: + NeuronLayerParser().detach_neuron_layer(name, layers) + + @staticmethod + def parse_layers(layer_cfg_path, param_cfg_path, model, layers={}): + try: + if not os.path.exists(layer_cfg_path): + raise LayerParsingError("Layer definition file '%s' does not exist" % layer_cfg_path) + if not os.path.exists(param_cfg_path): + raise LayerParsingError("Layer parameter file '%s' does not exist" % param_cfg_path) + if len(layers) == 0: + mcp = MyConfigParser(dict_type=OrderedDict) + mcp.readfp(open(layer_cfg_path)) + for name in mcp.sections(): + if not mcp.has_option(name, 'type'): + raise LayerParsingError("Layer '%s': no type given" % name) + ltype = mcp.safe_get(name, 'type') + if ltype not in layer_parsers: + raise LayerParsingError("Layer '%s': Unknown layer type: '%s'" % (name, ltype)) + layers[name] = layer_parsers[ltype]().parse(name, mcp, layers, model) + + LayerParser.detach_neuron_layers(layers) + for l in layers.values(): + l['parser'].optimize(layers) + del l['parser'] + + for name,l in layers.items(): + if not l['type'].startswith('cost.'): + found = max(name in l2['inputs'] for l2 in layers.values() if 'inputs' in l2) + if not found: + raise LayerParsingError("Layer '%s' of type '%s' is unused" % (name, l['type'])) + + mcp = MyConfigParser(dict_type=OrderedDict) + mcp.readfp(open(param_cfg_path)) +# mcp.convnet = model + for name,l in layers.items(): + if not mcp.has_section(name) and l['requiresParams']: + raise LayerParsingError("Layer '%s' of type '%s' requires extra parameters, but none given in file '%s'." % (name, l['type'], param_cfg_path)) + lp = layer_parsers[l['type']]().init(l) + lp.add_params(mcp) + except LayerParsingError, e: + print e + sys.exit(1) + return layers + + @staticmethod + def register_layer_parser(ltype, cls): + if ltype in layer_parsers: + raise LayerParsingError("Layer type '%s' already registered" % ltype) + layer_parsers[ltype] = cls + +# Any layer that takes an input (i.e. non-data layer) +class LayerWithInputParser(LayerParser): + def __init__(self, num_inputs=-1): + LayerParser.__init__(self) + self.num_inputs = num_inputs + + def verify_num_params(self, params, auto_expand=True): + for param in params: + if len(self.dic[param]) != len(self.dic['inputs']): + if auto_expand and len(self.dic[param]) == 1: + self.dic[param] *= len(self.dic['inputs']) + else: + raise LayerParsingError("Layer '%s': %s list length does not match number of inputs" % (self.dic['name'], param)) + + # layers: dictionary: name -> layer + def optimize(self, layers): + LayerParser.optimize(self, layers) + dic = self.dic + + # Check if I have an input that no one else uses. + #print "Layer %s optimizing" % dic['name'] + if not dic['forceOwnActs']: + for i, inp in enumerate(dic['inputLayers']): + if inp['outputs'] == dic['outputs'] and sum(('inputs' in ll) and (inp['name'] in ll['inputs']) for ll in layers.itervalues()) == 1: + # I can share my activity matrix with this layer + # if it does not use its activity matrix, and I + # do not need to remember my inputs. + # TODO: a dropout layer should always be able to overwrite + # its input. Make it so. 
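+                    # (The elementwise-sum and dropout parsers below set usesActs and
+                    # usesInputs to False precisely so that this sharing can kick in.)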
+# print "Layer %s(uses inputs=%d), input %s(uses acts = %d)" % (dic['name'], dic['usesInputs'], inp['name'], inp['usesActs']) + if not inp['usesActs'] and not dic['usesInputs']: + dic['actsTarget'] = i + print "Layer %s using acts from layer %s" % (dic['name'], inp['name']) +# print "Layer '%s' sharing activity matrix with layer '%s'" % (dic['name'], l['name']) + # I can share my gradient matrix with this layer if we're on the same GPU. + # This is different from the logic for actsTarget because this guy doesn't + # have an actsGrad matrix on my GPU if our GPUs are different, so there's + # nothing to share. + if dic['gpu'] == inp['gpu']: + dic['actsGradTarget'] = i +# print "Layer '%s' sharing activity gradient matrix with layer '%s'" % (dic['name'], l['name']) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerParser.parse(self, name, mcp, prev_layers, model) + + dic['inputs'] = [inp.strip() for inp in mcp.safe_get(name, 'inputs').split(',')] + + for inp in dic['inputs']: + if inp not in prev_layers: + raise LayerParsingError("Layer '%s': input layer '%s' not defined" % (name, inp)) + + dic['inputLayers'] = [prev_layers[inp] for inp in dic['inputs']] + dic['gpu'] = mcp.safe_get_int_list(name, 'gpu', default=dic['inputLayers'][0]['gpu']) + dic['gpus'] = ", ".join('%s' % d for d in dic['gpu']) + dic['numReplicas'] = len(dic['gpu']) + + if len(set(dic['gpu'])) != len(dic['gpu']): + raise LayerParsingError("Layer '%s': all replicas must run on different GPUs." % (name)) + + for inp in dic['inputs']: + # Data layers do not explicitly define how many replicas they have. + # The number of replicas for a data layer is given by the number of replicas + # in the next layer(s). So we set that here. + inpl = prev_layers[inp] + if inpl['type'] == 'data': + inpl['numReplicas'] = dic['numReplicas'] + if inpl['numReplicas'] % dic['numReplicas'] != 0: + raise LayerParsingError("Layer '%s': number of replicas (%d) must divide number of replicas in all input layers (input %s has %d replicas)." % (name, dic['numReplicas'], inpl['name'], inpl['numReplicas'])) + if len(set(inp['numReplicas'] for inp in dic['inputLayers'])) != 1: + raise LayerParsingError("Layer '%s': all input layers must have equal numbers of replicas." % (name)) + + # Need to also assert that all *next* layers have equal number of replicas but this is hard so it's done in Layer.optimize + for inp in dic['inputLayers']: + if inp['outputs'] == 0: + raise LayerParsingError("Layer '%s': input layer '%s' does not produce any output" % (name, inp['name'])) + dic['numInputs'] = [inp['outputs'] for inp in dic['inputLayers']] + + # Layers can declare a neuron activation function to apply to their output, as a shortcut + # to avoid declaring a separate neuron layer above themselves. 
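+        # An illustrative (hypothetical) layer-definition entry using this shortcut:
+        #   [fc1]
+        #   type=fc
+        #   inputs=pool1
+        #   outputs=1024
+        #   neuron=relu
+        # which detach_neuron_layers() later splits into 'fc1' plus an 'fc1_neuron' layer.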
+ dic['neuron'] = mcp.safe_get(name, 'neuron', default="") + if self.num_inputs > 0 and len(dic['numInputs']) != self.num_inputs: + raise LayerParsingError("Layer '%s': number of inputs must be %d" % (name, self.num_inputs)) + + if model: + self.verify_all_ints_in(dic['gpu'], 'gpu', range(len(model.op.get_value('gpu')))) + return dic + + def verify_img_size(self): + dic = self.dic + if dic['numInputs'][0] % dic['imgPixels'] != 0 or dic['imgSize'] * dic['imgSize'] != dic['imgPixels']: + raise LayerParsingError("Layer '%s': has %-d dimensional input, not interpretable as %d-channel images" % (dic['name'], dic['numInputs'][0], dic['channels'])) + + @staticmethod + def grad_consumers_below(dic): + if dic['gradConsumer']: + return True + if 'inputLayers' in dic: + return any(LayerWithInputParser.grad_consumers_below(l) for l in dic['inputLayers']) + + def verify_no_grads(self): + if LayerWithInputParser.grad_consumers_below(self.dic): + raise LayerParsingError("Layer '%s': layers of type '%s' cannot propagate gradient and must not be placed over layers with parameters." % (self.dic['name'], self.dic['type'])) + +class NailbedLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['stride'] = mcp.safe_get_int(name, 'stride') + + self.verify_num_range(dic['channels'], 'channels', 1, None) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['outputsX'] = (dic['imgSize'] + dic['stride'] - 1) / dic['stride'] + dic['start'] = (dic['imgSize'] - dic['stride'] * (dic['outputsX'] - 1)) / 2 + dic['outputs'] = dic['channels'] * dic['outputsX']**2 + + self.verify_num_range(dic['outputsX'], 'outputsX', 0, None) + + self.verify_img_size() + + print "Initialized bed-of-nails layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels']) + return dic + +class GaussianBlurLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + dic['outputs'] = dic['numInputs'][0] + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['filterSize'] = mcp.safe_get_int(name, 'filterSize') + dic['stdev'] = mcp.safe_get_float(name, 'stdev') + + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_int_in(dic['filterSize'], 'filterSize', [3, 5, 7, 9]) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['filter'] = n.array([exp(-(dic['filterSize']/2 - i)**2 / float(2 * dic['stdev']**2)) + for i in xrange(dic['filterSize'])], dtype=n.float32).reshape(1, dic['filterSize']) + dic['filter'] /= dic['filter'].sum() + self.verify_img_size() + + if dic['filterSize'] > dic['imgSize']: + raise LayerParsingError("Later '%s': filter size (%d) must be smaller than image size (%d)." 
% (dic['name'], dic['filterSize'], dic['imgSize'])) + + print "Initialized Gaussian blur layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + + return dic + +class HorizontalReflectionLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['numInputs'][0] + dic['channels'] = mcp.safe_get_int(name, 'channels') + + self.verify_num_range(dic['channels'], 'channels', 1, 3) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + self.verify_img_size() + + print "Initialized horizontal reflection layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + + return dic + +class ResizeLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['scale'] = mcp.safe_get_float(name, 'scale') + dic['tgtSize'] = int(floor(dic['imgSize'] / dic['scale'])) + dic['tgtPixels'] = dic['tgtSize']**2 + self.verify_num_range(dic['channels'], 'channels', 1, None) + # Really not recommended to use this for such severe scalings + self.verify_float_range(dic['scale'], 'scale', 0.5, 2) + + dic['outputs'] = dic['channels'] * dic['tgtPixels'] + + self.verify_img_size() + self.verify_no_grads() + + print "Initialized resize layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels']) + + return dic + +class RandomScaleLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + self.verify_num_range(dic['channels'], 'channels', 1, None) + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['maxScale'] = mcp.safe_get_float(name, 'maxScale') + dic['tgtSize'] = mcp.safe_get_int(name, 'tgtSize') + min_size = int(floor(dic['imgSize'] / dic['maxScale'])) + max_size = dic['imgSize'] #int(floor(dic['imgSize'] * dic['maxScale'])) + if dic['tgtSize'] < min_size: + raise LayerParsingError("Layer '%s': target size must be greater than minimum image size after rescaling (%d)" % (name, min_size)) + if dic['tgtSize'] > max_size: + raise LayerParsingError("Layer '%s': target size must be smaller than maximum image size after rescaling (%d)" % (name, max_size)) + dic['tgtPixels'] = dic['tgtSize']**2 + + self.verify_float_range(dic['maxScale'], 'maxScale', 1, 2) + + dic['outputs'] = dic['channels'] * dic['tgtPixels'] + + self.verify_img_size() + self.verify_no_grads() + + print "Initialized random scale layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], 
dic['channels']) + + return dic + +class CropLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + dic['channels'] = mcp.safe_get_int(name, 'channels') + self.verify_num_range(dic['channels'], 'channels', 1, None) + dic['startX'] = mcp.safe_get_int(name, 'startX') + dic['startY'] = mcp.safe_get_int(name, 'startY', default=dic['startX']) + dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + dic['outputs'] = dic['channels'] * (dic['sizeX']**2) + + self.verify_num_range(dic['startX'], 'startX', 0, dic['imgSize']-1) + self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) + self.verify_num_range(dic['startY'], 'startY', 0, dic['imgSize']-1) + self.verify_img_size() + self.verify_no_grads() + + if dic['startX'] + dic['sizeX'] > dic['imgSize']: + raise LayerParsingError("Layer '%s': startX (%d) + sizeX (%d) > imgSize (%d)" % (name, dic['startX'], dic['sizeX'], dic['imgSize'])) + + print "Initialized cropping layer '%s', producing %dx%d %d-channel output" % (name, dic['sizeX'], dic['sizeX'], dic['channels']) + + return dic + +class ColorTransformLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['forceOwnActs'] = False + dic['usesActs'] = False + dic['usesInputs'] = False + + # Computed values + dic['imgPixels'] = dic['numInputs'][0] / 3 + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['channels'] = 3 + dic['outputs'] = dic['numInputs'][0] + + self.verify_img_size() + self.verify_no_grads() + + return dic + +class RGBToYUVLayerParser(ColorTransformLayerParser): + def __init__(self): + ColorTransformLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model=None): + dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) + print "Initialized RGB --> YUV layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + return dic + +class RGBToLABLayerParser(ColorTransformLayerParser): + def __init__(self): + ColorTransformLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model=None): + dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) + dic['center'] = mcp.safe_get_bool(name, 'center', default=False) + print "Initialized RGB --> LAB layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) + return dic + +class NeuronLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + @staticmethod + def get_unused_layer_name(layers, wish): + if wish not in layers: + return wish + for i in xrange(1, 100): + name = '%s.%d' % (wish, i) + if name not in layers: + return name + raise LayerParsingError("This is insane.") + + def parse_neuron(self, neuron_str): + for n in neuron_parsers: + p = n.parse(neuron_str) + if p: # Successfully parsed neuron, return it + self.dic['neuron'] = p + self.dic['usesActs'] = self.dic['neuron']['usesActs'] + self.dic['usesInputs'] = 
self.dic['neuron']['usesInputs'] + + return + # Could not parse neuron + # Print available neuron types + colnames = ['Neuron type', 'Function'] + m = max(len(colnames[0]), OptionsParser._longest_value(neuron_parsers, key=lambda x:x.type)) + 2 + ntypes = [OptionsParser._bold(colnames[0].ljust(m))] + [n.type.ljust(m) for n in neuron_parsers] + fnames = [OptionsParser._bold(colnames[1])] + [n.func_str for n in neuron_parsers] + usage_lines = NL.join(ntype + fname for ntype,fname in zip(ntypes, fnames)) + + raise LayerParsingError("Layer '%s': unable to parse neuron type '%s'. Valid neuron types: %sWhere neurons have parameters, they must be floats." % (self.dic['name'], neuron_str, NL + usage_lines + NL)) + + def detach_neuron_layer(self, src_name, layers): + dic = self.dic +# self.set_defaults() + dic['name'] = NeuronLayerParser.get_unused_layer_name(layers, '%s_neuron' % src_name) + dic['type'] = 'neuron' + dic['inputs'] = src_name + dic['neuron'] = layers[src_name]['neuron'] + dic['gpu'] = layers[src_name]['gpu'] + + # Yes it's not entirely correct to pass all of layers as prev_layers, but it's harmless + dic = self.parse(dic['name'], FakeConfigParser(dic), layers) + dic['src_layer'] = src_name + + # Link upper layers to this new one + for l in layers.values(): + if 'inputs' in l: + l['inputs'] = [inp if inp != src_name else dic['name'] for inp in l['inputs']] + l['inputLayers'] = [inp if inp['name'] != src_name else dic for inp in l['inputLayers']] + layers[dic['name']] = dic + + def parse(self, name, mcp, prev_layers, model=None): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['numInputs'][0] + self.parse_neuron(dic['neuron']) + dic['forceOwnActs'] = False + print "Initialized neuron layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class EltwiseSumLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['coeffs'] = mcp.safe_get_float_list(name, 'coeffs', default=[1.0] * len(dic['inputs'])) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + + if len(set(dic['numInputs'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) + dic['outputs'] = dic['numInputs'][0] + dic['usesInputs'] = False + dic['usesActs'] = False + dic['forceOwnActs'] = False + dic['requiresParams'] = True + + print "Initialized elementwise sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class EltwiseMaxLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + if len(dic['inputs']) < 2: + raise LayerParsingError("Layer '%s': elementwise max layer must have at least 2 inputs, got %d." % (name, len(dic['inputs']))) + if len(set(dic['numInputs'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. 
Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) + dic['outputs'] = dic['numInputs'][0] + + print "Initialized elementwise max layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class SumLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + + dic['stride'] = mcp.safe_get_int(name, 'stride', default=1) + self.verify_divisible(dic['numInputs'][0], dic['stride'], 'input dimensionality', 'stride') + dic['outputs'] = dic['numInputs'][0] / dic['stride'] + + print "Initialized sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class DropoutLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['enable'] = mcp.safe_get_bool(name, 'enable', default=True) + dic['keep'] = mcp.safe_get_float(name, 'keep', default=0.5) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['usesInputs'] = False + dic['usesActs'] = False + dic['forceOwnActs'] = False + dic['outputs'] = dic['numInputs'][0] + + print "Initialized %s layer '%s' on GPUs %s, producing %d outputs" % (dic['type'], name, dic['gpus'], dic['outputs']) + return dic + +class Dropout2LayerParser(DropoutLayerParser): + def __init__(self): + DropoutLayerParser.__init__(self) + +class WeightLayerParser(LayerWithInputParser): + LAYER_PAT = re.compile(r'^\s*([^\s\[]+)(?:\[(\d+)\])?\s*$') # matches things like layername[5], etc + + def __init__(self, num_inputs=-1): + LayerWithInputParser.__init__(self, num_inputs=num_inputs) + + @staticmethod + def get_layer_name(name_str): + m = WeightLayerParser.LAYER_PAT.match(name_str) + if not m: + return None + return m.group(1), m.group(2) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['momW'] = mcp.safe_get_float_list(name, 'momW') + dic['momB'] = mcp.safe_get_float(name, 'momB') + dic['superEps'] = mcp.safe_get_float(name, 'superEps', default=0.0) + dic['superMom'] = mcp.safe_get_float(name, 'superMom', default=0.0) + dic['wc'] = mcp.safe_get_float_list(name, 'wc', default=[0.0] * len(dic['inputs'])) + dic['wball'] = mcp.safe_get_float_list(name, 'wball', default=[0.0] * len(dic['inputs'])) + self.verify_num_params(['momW', 'wc', 'wball']) +# dic['wballNormed'] = [wball * nweights for wball,nweights in zip(dic['wball'], dic['weightsPerFilter'])] + dic['wballNormed'] = dic['wball'] + + # Convert from old-style 0.001,0.02 hyperparam specification to new-stye + # const[base=0.001],const[base=0.02] and so forth + def convert_scalars_to_schedules(scalars): + parts = scalars.split(',') + for i,p in enumerate(parts): + p = p.strip() + if re.match('(?:\d*\.)?\d+$', p): + parts[i] = 'const[base=%s]' % p + return parts + + dic['epsW'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsW')), lrs_parsers, 'epsW', 'learning rate schedule', num_params=len(dic['inputs'])) + dic['epsB'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsB')), lrs_parsers, 'epsB', 'learning rate schedule', num_params=1)[0] + + 
dic['updatePeriod'] = mcp.safe_get_int(name, 'updatePeriod', default=0) # 0 means update as often as possible + # TODO: assert that updatePeriod is a multiple of active pass period, which is unknown here. + # the assert has to go in some post-processing step.. + dic['gradConsumer'] = dic['epsB']['params']['base'] > 0 or any(w['params']['base'] > 0 for w in dic['epsW']) + + @staticmethod + def unshare_weights(layer, layers, matrix_idx=None): + def unshare(layer, layers, indices): + for i in indices: + if layer['weightSourceLayers'][i] >= 0: + src_matrix_idx = layer['weightSourceMatrixIndices'][i] + layer['weightSourceLayers'][i] = "" + layer['weightSourceMatrixIndices'][i] = -1 + layer['weights'][i] = layer['weights'][i].copy() + layer['weightsInc'][i] = n.zeros_like(layer['weights'][i]) + print "Unshared weight matrix %s[%d] from %s[%d]." % (layer['name'], i, layer['weightSourceLayers'][i], src_matrix_idx) + else: + print "Weight matrix %s[%d] already unshared." % (layer['name'], i) + if 'weightSourceLayers' in layer: + unshare(layer, layers, range(len(layer['inputs'])) if matrix_idx is None else [matrix_idx]) + + # Load weight/biases initialization module + def call_init_func(self, param_name, shapes, input_idx=-1): + dic = self.dic + func_pat = re.compile('^([^\.]+)\.([^\(\)]+)\s*(?:\(([^,]+(?:,[^,]+)*)\))?$') + m = func_pat.match(dic[param_name]) + if not m: + raise LayerParsingError("Layer '%s': '%s' parameter must have format 'moduleName.functionName(param1,param2,...)'; got: %s." % (dic['name'], param_name, dic['initWFunc'])) + module, func = m.group(1), m.group(2) + params = m.group(3).split(',') if m.group(3) is not None else [] + try: + mod = __import__(module) + return getattr(mod, func)(dic['name'], input_idx, shapes, params=params) if input_idx >= 0 else getattr(mod, func)(dic['name'], shapes, params=params) + except (ImportError, AttributeError, TypeError), e: + raise LayerParsingError("Layer '%s': %s." % (dic['name'], e)) + + def make_weights(self, initW, rows, cols, order='C'): + dic = self.dic + dic['weights'], dic['weightsInc'] = [], [] + if dic['initWFunc']: # Initialize weights from user-supplied python function + # Initialization function is supplied in the format + # module.func + for i in xrange(len(dic['inputs'])): + dic['weights'] += [self.call_init_func('initWFunc', (rows[i], cols[i]), input_idx=i)] + + if type(dic['weights'][i]) != n.ndarray: + raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], i, dic['initWFunc'], type(dic['weights'][i]))) + if dic['weights'][i].dtype != n.float32: + raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must weight matrices consisting of single-precision floats. Got: %s." % (dic['name'], i, dic['initWFunc'], dic['weights'][i].dtype)) + if dic['weights'][i].shape != (rows[i], cols[i]): + raise LayerParsingError("Layer '%s[%d]': weight matrix returned by weight initialization function %s has wrong shape. Should be: %s; got: %s." 
% (dic['name'], i, dic['initWFunc'], (rows[i], cols[i]), dic['weights'][i].shape)) + # Convert to desired order + dic['weights'][i] = n.require(dic['weights'][i], requirements=order) + dic['weightsInc'] += [n.zeros_like(dic['weights'][i])] + print "Layer '%s[%d]' initialized weight matrices from function %s" % (dic['name'], i, dic['initWFunc']) + else: + for i in xrange(len(dic['inputs'])): + if dic['weightSourceLayers'][i] != '': # Shared weight matrix + src_layer = self.prev_layers[dic['weightSourceLayers'][i]] if dic['weightSourceLayers'][i] != dic['name'] else dic + dic['weights'] += [src_layer['weights'][dic['weightSourceMatrixIndices'][i]]] + dic['weightsInc'] += [src_layer['weightsInc'][dic['weightSourceMatrixIndices'][i]]] + if dic['weights'][i].shape != (rows[i], cols[i]): + raise LayerParsingError("Layer '%s': weight sharing source matrix '%s' has shape %dx%d; should be %dx%d." + % (dic['name'], dic['weightSource'][i], dic['weights'][i].shape[0], dic['weights'][i].shape[1], rows[i], cols[i])) + print "Layer '%s' initialized weight matrix %d from %s" % (dic['name'], i, dic['weightSource'][i]) + else: + dic['weights'] += [n.array(initW[i] * nr.randn(rows[i], cols[i]), dtype=n.single, order=order)] + dic['weightsInc'] += [n.zeros_like(dic['weights'][i])] + + def make_biases(self, rows, cols, order='C'): + dic = self.dic + if dic['initBFunc']: + dic['biases'] = self.call_init_func('initBFunc', (rows, cols)) + if type(dic['biases']) != n.ndarray: + raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], dic['initBFunc'], type(dic['biases']))) + if dic['biases'].dtype != n.float32: + raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object consisting of single-precision floats. Got: %s." % (dic['name'], dic['initBFunc'], dic['biases'].dtype)) + if dic['biases'].shape != (rows, cols): + raise LayerParsingError("Layer '%s': bias vector returned by bias initialization function %s has wrong shape. Should be: %s; got: %s." % (dic['name'], dic['initBFunc'], (rows, cols), dic['biases'].shape)) + + dic['biases'] = n.require(dic['biases'], requirements=order) + print "Layer '%s' initialized bias vector from function %s" % (dic['name'], dic['initBFunc']) + else: + dic['biases'] = dic['initB'] * n.ones((rows, cols), order=order, dtype=n.single) + dic['biasesInc'] = n.zeros_like(dic['biases']) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['gradConsumer'] = True + dic['usesActs'] = False + dic['initW'] = mcp.safe_get_float_list(name, 'initW', default=0.01) + dic['initB'] = mcp.safe_get_float(name, 'initB', default=0) + dic['initWFunc'] = mcp.safe_get(name, 'initWFunc', default="") + dic['initBFunc'] = mcp.safe_get(name, 'initBFunc', default="") + # Find shared weight matrices + + dic['weightSource'] = mcp.safe_get_list(name, 'weightSource', default=[''] * len(dic['inputs'])) + self.verify_num_params(['initW']) + self.verify_num_params(['weightSource'], auto_expand=False) + + dic['weightSourceLayers'] = [] + dic['weightSourceMatrixIndices'] = [] + + for i, src_name in enumerate(dic['weightSource']): + src_layer_matrix_idx = -1 + src_layer_name = '' + if src_name != '': + src_layer_match = WeightLayerParser.get_layer_name(src_name) + if src_layer_match is None: + raise LayerParsingError("Layer '%s': unable to parse weight sharing source '%s'. 
Format is layer[idx] or just layer, in which case idx=0 is used." % (name, src_name)) + src_layer_name = src_layer_match[0] + src_layer_matrix_idx = int(src_layer_match[1]) if src_layer_match[1] is not None else 0 + + if src_layer_name not in prev_layers and src_layer_name != name: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' does not exist." % (name, src_layer_name)) + +# src_layer_idx = prev_names.index(src_layer_name) if src_layer_name != name else len(prev_names) + src_layer = prev_layers[src_layer_name] if src_layer_name != name else dic + if src_layer['gpu'] != dic['gpu']: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' runs on GPUs %s, while '%s' runs on GPUs %s." % (name, src_layer_name, src_layer['gpu'], name, dic['gpu'])) + if src_layer['type'] != dic['type']: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' is of type '%s'; should be '%s'." % (name, src_layer_name, src_layer['type'], dic['type'])) + if src_layer_name != name and len(src_layer['weights']) <= src_layer_matrix_idx: + raise LayerParsingError("Layer '%s': weight sharing source layer '%s' has %d weight matrices, but '%s[%d]' requested." % (name, src_layer_name, len(src_layer['weights']), src_name, src_layer_matrix_idx)) + if src_layer_name == name and src_layer_matrix_idx >= i: + raise LayerParsingError("Layer '%s': weight sharing source '%s[%d]' not defined yet." % (name, name, src_layer_matrix_idx)) + + dic['weightSourceLayers'] += [src_layer_name] + dic['weightSourceMatrixIndices'] += [src_layer_matrix_idx] + + return dic + +class FCLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + + dic['outputs'] = mcp.safe_get_int(name, 'outputs') + dic['weightsPerFilter'] = dic['numInputs'] + self.verify_num_range(dic['outputs'], 'outputs', 1, None) + self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']] * len(dic['numInputs']), order='F') + self.make_biases(1, dic['outputs'], order='F') + + print "Initialized fully-connected layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class SplitFCLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + dic['parts'] = mcp.safe_get_int(name, 'parts') + dic['outputs'] = mcp.safe_get_int(name, 'outputs') * dic['parts'] + dic['weightsPerFilter'] = dic['numInputs'] + self.verify_num_range(dic['parts'], 'parts', 1, None) + + self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']/dic['parts']] * len(dic['numInputs']), order='F') + self.make_biases(1, dic['outputs'], order='F') + + for i in xrange(len(dic['numInputs'])): + self.verify_divisible(dic['numInputs'][i], dic['parts'], 'numInputs', 'parts', input_idx=i) + + print "Initialized split fully-connected layer '%s' on GPUs %s, producing %d outputs in %d parts" % (name, dic['gpus'], dic['outputs'], dic['parts']) + return dic + +class LocalLayerParser(WeightLayerParser): + def __init__(self): + WeightLayerParser.__init__(self) + + # Convert convolutional layer to unshared, locally-connected layer + @staticmethod + def conv_to_local(layers, lname): + layer = layers[lname] + if layer['type'] == 'conv': + layer['type'] = 'local' + for inp,inpname in enumerate(layer['inputs']): + 
src_layer_name = layer['weightSourceLayers'][inp] + if src_layer_name != '': + src_layer = layers[src_layer_name] + src_matrix_idx = layer['weightSourceMatrixIndices'][inp] + LocalLayerParser.conv_to_local(layers, src_layer_name) + for w in ('weights', 'weightsInc'): + layer[w][inp] = src_layer[w][src_matrix_idx] + else: + layer['weights'][inp] = n.require(n.reshape(n.tile(n.reshape(layer['weights'][inp], (1, n.prod(layer['weights'][inp].shape))), (layer['modules'], 1)), + (layer['modules'] * layer['filterChannels'][inp] * layer['filterPixels'][inp], layer['filters'])), + requirements='C') + layer['weightsInc'][inp] = n.zeros_like(layer['weights'][inp]) + if layer['sharedBiases']: + layer['biases'] = n.require(n.repeat(layer['biases'], layer['modules'], axis=0), requirements='C') + layer['biasesInc'] = n.zeros_like(layer['biases']) + + print "Converted layer '%s' from convolutional to unshared, locally-connected" % layer['name'] + + # Also call this function on any layers sharing my weights + for l in layers: + if 'weightSourceLayers' in l and lname in l['weightSourceLayers']: + LocalLayerParser.conv_to_local(layers, l) + return layer + + def parse(self, name, mcp, prev_layers, model): + dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['usesActs'] = False + # Supplied values + dic['channels'] = mcp.safe_get_int_list(name, 'channels') + dic['padding'] = mcp.safe_get_int_list(name, 'padding', default=[0]*len(dic['inputs'])) + dic['stride'] = mcp.safe_get_int_list(name, 'stride', default=[1]*len(dic['inputs'])) + dic['filterSize'] = mcp.safe_get_int_list(name, 'filterSize') + dic['filters'] = mcp.safe_get_int_list(name, 'filters') + dic['groups'] = mcp.safe_get_int_list(name, 'groups', default=[1]*len(dic['inputs'])) + dic['initW'] = mcp.safe_get_float_list(name, 'initW') + dic['initCFunc'] = mcp.safe_get(name, 'initCFunc', default='') + dic['modulesX'] = mcp.safe_get_int(name, 'modulesX', default=0) + + + self.verify_num_params(['channels', 'padding', 'stride', 'filterSize', \ + 'filters', 'groups', 'initW']) + + self.verify_num_range(dic['stride'], 'stride', 1, None) + self.verify_num_range(dic['filterSize'],'filterSize', 1, None) + self.verify_num_range(dic['padding'], 'padding', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_num_range(dic['groups'], 'groups', 1, None) + self.verify_num_range(dic['modulesX'], 'modulesX', 0, None) + for i in xrange(len(dic['filters'])): + self.verify_divisible(dic['filters'][i], 16, 'filters', input_idx=i) + + # Computed values + dic['imgPixels'] = [numInputs/channels for numInputs,channels in zip(dic['numInputs'], dic['channels'])] + dic['imgSize'] = [int(n.sqrt(imgPixels)) for imgPixels in dic['imgPixels']] + self.verify_num_range(dic['imgSize'], 'imgSize', 1, None) + dic['filters'] = [filters*groups for filters,groups in zip(dic['filters'], dic['groups'])] + dic['filterPixels'] = [filterSize**2 for filterSize in dic['filterSize']] + if dic['modulesX'] <= 0: + dic['modulesX'] = [1 + int(ceil((2*padding + imgSize - filterSize) / float(stride))) for padding,imgSize,filterSize,stride in zip(dic['padding'], dic['imgSize'], dic['filterSize'], dic['stride'])] + else: + dic['modulesX'] = [dic['modulesX']] * len(dic['inputs']) + + dic['filterChannels'] = [channels/groups for channels,groups in zip(dic['channels'], dic['groups'])] + + if len(set(dic['modulesX'])) != 1 or len(set(dic['filters'])) != 1: + raise LayerParsingError("Layer '%s': all inputs must produce 
equally-dimensioned output. Dimensions are: %s." % (name, ", ".join("%dx%dx%d" % (filters, modulesX, modulesX) for filters,modulesX in zip(dic['filters'], dic['modulesX']))))
+
+        dic['modulesX'] = dic['modulesX'][0]
+        dic['modules'] = dic['modulesX']**2
+        dic['filters'] = dic['filters'][0]
+        dic['outputs'] = dic['modules'] * dic['filters']
+#        dic['filterConns'] = [[]] * len(dic['inputs'])
+        for i in xrange(len(dic['inputs'])):
+            if dic['numInputs'][i] % dic['imgPixels'][i] != 0 or dic['imgSize'][i] * dic['imgSize'][i] != dic['imgPixels'][i]:
+                raise LayerParsingError("Layer '%s[%d]': has %-d dimensional input, not interpretable as square %d-channel images" % (name, i, dic['numInputs'][i], dic['channels'][i]))
+            if dic['channels'][i] > 3 and dic['channels'][i] % 4 != 0:
+                raise LayerParsingError("Layer '%s[%d]': number of channels must be smaller than 4 or divisible by 4" % (name, i))
+#            if dic['filterSize'][i] > totalPadding[i] + dic['imgSize'][i]:
+#                raise LayerParsingError("Layer '%s[%d]': filter size (%d) greater than image size + padding (%d)" % (name, i, dic['filterSize'][i], dic['padding'][i] + dic['imgSize'][i]))
+            if -dic['padding'][i] + dic['stride'][i] * (dic['modulesX'] - 1) + dic['filterSize'][i] < dic['imgSize'][i]:
+                raise LayerParsingError("Layer '%s[%d]': %dx%d output map with padding=%d, stride=%d does not cover entire input image." % (name, i, dic['modulesX'], dic['modulesX'], dic['padding'][i], dic['stride'][i]))
+
+            if dic['groups'][i] > 1:
+                self.verify_divisible(dic['channels'][i], 4*dic['groups'][i], 'channels', '4 * groups', input_idx=i)
+                self.verify_divisible(dic['channels'][i], dic['groups'][i], 'channels', 'groups', input_idx=i)
+
+                self.verify_divisible(dic['filters'], 16*dic['groups'][i], 'filters * groups', input_idx=i)
+
+            dic['padding'][i] = -dic['padding'][i]
+#        dic['overSample'] = [groups*filterChannels/channels for groups,filterChannels,channels in zip(dic['groups'], dic['filterChannels'], dic['channels'])]
+        dic['weightsPerFilter'] = [fc * (fz**2) for fc, fz in zip(dic['filterChannels'], dic['filterSize'])]
+
+        return dic
+
+class ConvLayerParser(LocalLayerParser):
+    def __init__(self):
+        LocalLayerParser.__init__(self)
+
+    def add_params(self, mcp):
+        LocalLayerParser.add_params(self, mcp)
+        self.dic['wcNormMax'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMax', default=[0.0] * len(self.dic['inputs']))
+        self.dic['wcNormMin'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMin', default=[0.0] * len(self.dic['inputs']))
+        self.verify_num_params(['wcNormMax', 'wcNormMin'])
+        for min,max in zip(self.dic['wcNormMin'], self.dic['wcNormMax']):
+            if min > max:
+                raise LayerParsingError("Layer '%s': wcNormMin must be <= wcNormMax."
% (self.dic['name'])) + + def parse(self, name, mcp, prev_layers, model): + dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) + + dic['sumWidth'] = mcp.safe_get_int(name, 'sumWidth') + dic['sharedBiases'] = mcp.safe_get_bool(name, 'sharedBiases', default=True) + + num_biases = dic['filters'] if dic['sharedBiases'] else dic['modules']*dic['filters'] + + eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] + self.make_weights(dic['initW'], eltmult(dic['filterPixels'], dic['filterChannels']), [dic['filters']] * len(dic['inputs']), order='C') + self.make_biases(num_biases, 1, order='C') + + print "Initialized convolutional layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) + return dic + +class LocalUnsharedLayerParser(LocalLayerParser): + def __init__(self): + LocalLayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) + eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] + scmult = lambda x, lst: [x * l for l in lst] + self.make_weights(dic['initW'], scmult(dic['modules'], eltmult(dic['filterPixels'], dic['filterChannels'])), [dic['filters']] * len(dic['inputs']), order='C') + self.make_biases(dic['modules'] * dic['filters'], 1, order='C') + + print "Initialized locally-connected layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) + return dic + +class DataLayerParser(LayerParser): + def __init__(self): + LayerParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerParser.parse(self, name, mcp, prev_layers, model) + dic['dataIdx'] = mcp.safe_get_int(name, 'dataIdx') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['end'] = mcp.safe_get_int(name, 'end', default=model.train_data_provider.get_data_dims(idx=dic['dataIdx'])) + dic['outputs'] = dic['end'] - dic['start'] +# dic['usesActs'] = False + print "Initialized data layer '%s', producing %d outputs" % (name, dic['outputs']) + return dic + +class SoftmaxLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = dic['inputLayers'][0]['outputs'] + print "Initialized softmax layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class ConcatentionLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) + dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] + print "Initialized concatenation layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class PassThroughLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self) + + # Note: this doesn't verify all the necessary constraints. Layer construction may still fail in C++ code. + # For example, it does not verify that every layer only has one pass-through parent. Obviously having + # two such parents is incoherent. 
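+    # (Roughly: a 'pass' layer exposes its inputs' activations to the layer above without
+    # transforming them, so its output size is just the sum of its inputs' output sizes; the
+    # checks below only catch replica-count mismatches and two 'pass' layers that read the
+    # same input on overlapping GPUs.)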
+ def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) +# if len(dic['inputLayers']) == 1: +# raise LayerParsingError("Layer %s: pass-through layer must have more than one input." % dic['name']) + if len(dic['gpu']) != len(dic['inputLayers'][0]['gpu']): + raise LayerParsingError("Layer '%s': number of replicas in pass-through layer must be equivalent to number of replicas in input layers." % dic['name']) + for inp in dic['inputLayers']: + conflicting_layers = [l for l in prev_layers.values() if l['type'] == 'pass' and inp['name'] in l['inputs'] and len(set(dic['gpu']).intersection(set(l['gpu']))) > 0] + if len(conflicting_layers) > 0: + raise LayerParsingError("Layer '%s' conflicts with layer '%s'. Both pass-through layers take layer '%s' as input and operate on an overlapping set of GPUs." % (dic['name'], conflicting_layers[0]['name'], inp['name'])) + dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) +# dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] + print "Initialized pass-through layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) + return dic + +class PoolLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['stride'] = mcp.safe_get_int(name, 'stride') + dic['outputsX'] = mcp.safe_get_int(name, 'outputsX', default=0) + dic['pool'] = mcp.safe_get(name, 'pool') + + # Avg pooler does not use its acts or inputs + dic['usesActs'] = dic['pool'] != 'avg' + dic['usesInputs'] = dic['pool'] != 'avg' + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + if dic['pool'] == 'avg': + dic['sum'] = mcp.safe_get_bool(name, 'sum', default=False) + + self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) + self.verify_num_range(dic['stride'], 'stride', 1, dic['sizeX']) + self.verify_num_range(dic['outputsX'], 'outputsX', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + + if LayerWithInputParser.grad_consumers_below(dic): + self.verify_divisible(dic['channels'], 16, 'channels') + self.verify_str_in(dic['pool'], 'pool', ['max', 'maxabs', 'avg']) + + self.verify_img_size() + + if dic['outputsX'] <= 0: + dic['outputsX'] = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1; + dic['outputs'] = dic['outputsX']**2 * dic['channels'] + + print "Initialized %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels']) + return dic + + +class CrossMapPoolLayerParser(LayerWithInputParser): + def __init__(self): + LayerWithInputParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['size'] = mcp.safe_get_int(name, 'size') + dic['start'] = mcp.safe_get_int(name, 'start', default=0) + dic['stride'] = 
mcp.safe_get_int(name, 'stride') + dic['outputChannels'] = mcp.safe_get_int(name, 'outputs', default=0) + dic['pool'] = mcp.safe_get(name, 'pool') + dic['requiresParams'] = False + + # Avg pooler does not use its acts or inputs + dic['usesActs'] = 'pool' != 'avg' + dic['usesInputs'] = 'pool' != 'avg' + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + dic['outputs'] = dic['outputChannels'] * dic['imgPixels'] + + self.verify_num_range(dic['size'], 'size', 1, dic['channels']) + self.verify_num_range(dic['stride'], 'stride', 1, dic['size']) + self.verify_num_range(dic['outputChannels'], 'outputChannels', 0, None) + self.verify_num_range(dic['channels'], 'channels', 1, None) + self.verify_num_range(dic['start'], 'start', None, 0) + + self.verify_str_in(dic['pool'], 'pool', ['max']) + self.verify_img_size() + + covered_chans = dic['start'] + (dic['outputChannels'] - 1) * dic['stride'] + dic['size'] + if covered_chans < dic['channels']: + raise LayerParsingError("Layer '%s': cross-map pooling with start=%d, stride=%d, size=%d, outputs=%d covers only %d of %d input channels." % \ + (name, dic['start'], dic['stride'], dic['size'], dic['outputChannels'], covered_chans, dic['channels'])) + + print "Initialized cross-map %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['outputChannels']) + return dic + +class NormLayerParser(LayerWithInputParser): + RESPONSE_NORM = 'response' + CONTRAST_NORM = 'contrast' + CROSSMAP_RESPONSE_NORM = 'cross-map response' + + def __init__(self, norm_type): + LayerWithInputParser.__init__(self, num_inputs=1) + self.norm_type = norm_type + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['scale'] = mcp.safe_get_float(name, 'scale') + dic['scale'] /= dic['size'] if self.norm_type == self.CROSSMAP_RESPONSE_NORM else dic['size']**2 + dic['pow'] = mcp.safe_get_float(name, 'pow') + dic['minDiv'] = mcp.safe_get_float(name, 'minDiv', default=1.0) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + dic['channels'] = mcp.safe_get_int(name, 'channels') + dic['size'] = mcp.safe_get_int(name, 'size') + dic['blocked'] = mcp.safe_get_bool(name, 'blocked', default=False) + + dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] + dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) + + # Contrast normalization layer does not use its inputs + dic['usesInputs'] = self.norm_type != self.CONTRAST_NORM + + self.verify_num_range(dic['channels'], 'channels', 1, None) + if self.norm_type == self.CROSSMAP_RESPONSE_NORM: + self.verify_num_range(dic['size'], 'size', 2, dic['channels']) + if dic['channels'] % 16 != 0: + raise LayerParsingError("Layer '%s': number of channels must be divisible by 16 when using crossMap" % name) + else: + self.verify_num_range(dic['size'], 'size', 1, dic['imgSize']) + + if self.norm_type != self.CROSSMAP_RESPONSE_NORM and dic['channels'] > 3 and dic['channels'] % 4 != 0: + raise LayerParsingError("Layer '%s': number of channels must be smaller than 4 or divisible by 4" % name) + + self.verify_img_size() + + dic['outputs'] = dic['imgPixels'] * dic['channels'] + print "Initialized %s-normalization layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (self.norm_type, name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['channels']) + return 
dic + +class CostParser(LayerWithInputParser): + def __init__(self, num_inputs=-1): + LayerWithInputParser.__init__(self, num_inputs=num_inputs) + + def parse(self, name, mcp, prev_layers, model): + dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + # Stored as string because python can't pickle lambda functions + dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs]' + dic['children'] = mcp.safe_get_list(name, 'children', default=[]) + # Aggregated costs only produce outputs which are additive. + for c in dic['children']: + if c not in prev_layers: + raise LayerParsingError("Layer '%s': child cost layer '%s' not defined" % (name, c)) + if prev_layers[c]['type'] != dic['type']: + raise LayerParsingError("Layer '%s': child cost layer '%s' must have same type as parent" % (name, c)) + prev_layers[c]['aggregated'] = 1 + dic['aggregated'] = dic['children'] != [] + del dic['neuron'] + return dic + + def add_params(self, mcp): + LayerWithInputParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['coeff'] = mcp.safe_get_float(name, 'coeff') + dic['gradConsumer'] = dic['coeff'] > 0 + +class CrossEntCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels + raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name) + if dic['inputLayers'][1]['type'] != 'softmax': + raise LayerParsingError("Layer '%s': Second input must be softmax layer" % name) + if dic['numInputs'][1] != model.train_data_provider.get_num_classes(): + raise LayerParsingError("Layer '%s': Softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \ + % (name, dic['inputs'][1], model.train_data_provider.get_num_classes())) + + print "Initialized cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class LogregCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def add_params(self, mcp): + CostParser.add_params(self, mcp) + dic, name = self.dic, self.dic['name'] + dic['topk'] = mcp.safe_get_int(name, 'topk', default=1) + if dic['topk'] > dic['numInputs'][1]: + raise LayerParsingError("Layer '%s': parameter 'topk'must not have value greater than the number of classess." 
% (name)) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + dic['requiresParams'] = True + if dic['numInputs'][0] != 1: # first input must be labels + raise LayerParsingError("Layer '%s': dimensionality of first input must be 1" % name) + if dic['inputLayers'][1]['type'] != 'softmax': + raise LayerParsingError("Layer '%s': second input must be softmax layer" % name) + if dic['numInputs'][1] != model.train_data_provider.get_num_classes(): + raise LayerParsingError("Layer '%s': softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \ + % (name, dic['inputs'][1], model.train_data_provider.get_num_classes())) + + print "Initialized logistic regression cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class BinomialCrossEntCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=2) + + def add_params(self, mcp): + CostParser.add_params(self, mcp) + self.dic['posWeight'] = mcp.safe_get_float(self.dic['name'], 'posWeight', default=1.0) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + + if dic['numInputs'][0] != dic['numInputs'][1]: + raise LayerParsingError("Layer '%s': both inputs must produce the same number of outputs" % (name)) + + if 'neuron' not in dic['inputLayers'][1] or dic['inputLayers'][1]['neuron'] != 'logistic': + print "WARNING: Layer '%s': input '%s' is not logistic, results may not be what you intend." % (dic['name'], dic['inputs'][1]) + + if dic['type'] == 'cost.bce': + print "Initialized binomial cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + + + dic['computeSoftmaxErrorRate'] = True + return dic + +class DetectionCrossEntCostParser(BinomialCrossEntCostParser): + def __init__(self): + BinomialCrossEntCostParser.__init__(self) + + def parse(self, name, mcp, prev_layers, model): + dic = BinomialCrossEntCostParser.parse(self, name, mcp, prev_layers, model) + if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels + raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name) + dic['computeSoftmaxErrorRate'] = False + dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs[:2]] + [(class_cost[2] / class_cost[j] if class_cost[j] > 0 else n.inf) for class_cost in [costs[2:][i*3:(i+1)*3] for i in range(len(costs[2:])/3)] for j in range(2)]' + dic['outputFilterFormatter'] = 'lambda self,costs: "(crossent) %.6f, (err) %.6f, " % (costs[0], costs[1]) + ", ".join("(%s) %.6f, %.6f" % (self.train_data_provider.batch_meta["label_names"][i/2-1],costs[i],costs[i+1]) for i in xrange(2, len(costs), 2))' + print "Initialized detection cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +class SumOfSquaresCostParser(CostParser): + def __init__(self): + CostParser.__init__(self, num_inputs=1) + + def parse(self, name, mcp, prev_layers, model): + dic = CostParser.parse(self, name, mcp, prev_layers, model) + print "Initialized sum-of-squares cost '%s' on GPUs %s" % (name, dic['gpus']) + return dic + +# All the layer parsers +layer_parsers = {'data' : lambda : DataLayerParser(), + 'fc': lambda : FCLayerParser(), + 'sfc': lambda : SplitFCLayerParser(), + 'conv': lambda : ConvLayerParser(), + 'local': lambda : LocalUnsharedLayerParser(), + 'softmax': lambda : SoftmaxLayerParser(), + 'eltsum': lambda : EltwiseSumLayerParser(), + 'eltmax': lambda : 
EltwiseMaxLayerParser(), + 'sum': lambda : SumLayerParser(), + 'neuron': lambda : NeuronLayerParser(), + 'pool': lambda : PoolLayerParser(), + 'cmpool': lambda : CrossMapPoolLayerParser(), + 'rnorm': lambda : NormLayerParser(NormLayerParser.RESPONSE_NORM), + 'cnorm': lambda : NormLayerParser(NormLayerParser.CONTRAST_NORM), + 'cmrnorm': lambda : NormLayerParser(NormLayerParser.CROSSMAP_RESPONSE_NORM), + 'nailbed': lambda : NailbedLayerParser(), + 'blur': lambda : GaussianBlurLayerParser(), + 'href': lambda : HorizontalReflectionLayerParser(), + 'resize': lambda : ResizeLayerParser(), + 'rgb2yuv': lambda : RGBToYUVLayerParser(), + 'rgb2lab': lambda : RGBToLABLayerParser(), + 'rscale': lambda : RandomScaleLayerParser(), + 'crop': lambda : CropLayerParser(), + 'concat': lambda : ConcatentionLayerParser(), + 'pass': lambda : PassThroughLayerParser(), + 'dropout': lambda : DropoutLayerParser(), + 'dropout2': lambda : Dropout2LayerParser(), + 'cost.logreg': lambda : LogregCostParser(), + 'cost.crossent': lambda : CrossEntCostParser(), + 'cost.bce': lambda : BinomialCrossEntCostParser(), + 'cost.dce': lambda : DetectionCrossEntCostParser(), + 'cost.sum2': lambda : SumOfSquaresCostParser()} + +# All the neuron parsers +# This isn't a name --> parser mapping as the layer parsers above because neurons don't have fixed names. +# A user may write tanh[0.5,0.25], etc. +neuron_parsers = sorted([NeuronParser('ident', 'f(x) = x', uses_acts=False, uses_inputs=False), + NeuronParser('logistic', 'f(x) = 1 / (1 + e^-x)', uses_acts=True, uses_inputs=False), + NeuronParser('abs', 'f(x) = |x|', uses_acts=False, uses_inputs=True), + NeuronParser('relu', 'f(x) = max(0, x)', uses_acts=True, uses_inputs=False), + NeuronParser('nrelu', 'f(x) = max(0, x) + noise', uses_acts=True, uses_inputs=False), + NeuronParser('softrelu', 'f(x) = log(1 + e^x)', uses_acts=True, uses_inputs=False), + NeuronParser('square', 'f(x) = x^2', uses_acts=False, uses_inputs=True), + NeuronParser('sqrt', 'f(x) = sqrt(x)', uses_acts=True, uses_inputs=False), + ParamNeuronParser('log[a]', 'f(x) = log(a + x)', uses_acts=False, uses_inputs=True), + ParamNeuronParser('tanh[a,b]', 'f(x) = a * tanh(b * x)', uses_acts=True, uses_inputs=False), + ParamNeuronParser('brelu[a]', 'f(x) = min(a, max(0, x))', uses_acts=True, uses_inputs=False), + ParamNeuronParser('linear[a,b]', 'f(x) = a * x + b', uses_acts=True, uses_inputs=False), + ParamNeuronParser('drelu[a]', 'f(x) = x - a * tanh(x / a)', uses_acts=False, uses_inputs=True)], + key=lambda x:x.type) + +# Learning rate schedules +lrs_parsers = sorted([ParamParser('const[fbase]'), + ParamParser('linear[fbase;ftgtFactor]'), + ParamParser('exp[fbase;ftgtFactor]'), + ParamParser('dexp[fbase;ftgtFactor;inumSteps]')]) diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg new file mode 100644 index 0000000..a24d538 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg @@ -0,0 +1,57 @@ +# 11% error on CIFAR-10 - layer parameter file +# Methodology: +# 1. Train on batches 1-4, use batch 5 for validation. +# 2. After about 350 epochs, validation error no longer making improvements. +# 3. Fold in batch 5. +# 4. Train on batches 1-5 for about 150 more epochs, until the batch 5 error is near the errors for batches 1-4. It takes forever to actually get there but after 150 epochs it's close enough. +# 5. Lower learning rates (epsW) by a factor of 10 to 0.0001, train for 10 more epochs. +# 6. 
Lower learning rates (epsW) by another factor of 10 to 0.00001, train for 10 more epochs. +# 7. Stop. Test on batch 6 with --test-range=6 --multiview-test=1 --logreg-name=logprob (read more about what this does here: http://code.google.com/p/cuda-convnet/wiki/TrainingNet#Training_on_image_translations ) + +# More details about methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology + +[conv1] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.000 + +[conv2] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.000 + +[local3] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.004 + +[local4] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.004 + +[fc10] +epsW=0.001 +epsB=0.002 +momW=0.9 +momB=0.9 +wc=0.01 + +[logprob] +coeff=1 + +[rnorm1] +scale=0.001 +pow=0.75 + +[rnorm2] +scale=0.001 +pow=0.75 diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg new file mode 100644 index 0000000..9462f5b --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg new file mode 100644 index 0000000..f06dda2 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] 
+momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg new file mode 100644 index 0000000..a4dba87 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg @@ -0,0 +1,182 @@ +[conv1a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv1b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv2a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv2b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 + + +[conv3a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + + +[conv3b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[conv4a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv4b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv5a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[conv5b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 + + +[fc2048a] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048b] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048ba] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[fc2048bb] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + +[fc1000] +epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] +momW=0.9,0.9 +momB=0.9 +wc=0.0005,0.0005 +wball=0,0 + + +[logprob] +coeff=1 +topk=5 + +[dropout1a] +enable=true +keep=0.5 + +[dropout2a] +enable=true +keep=0.5 + 
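The epsW/epsB entries in these parameter files use the dexp[base;tgtFactor;numSteps] learning-rate schedule that layer.py registers in its lrs_parsers list. The exact evaluation lives in the trainer's C++ code; a plausible reading, assumed here, is a discrete exponential decay from base down to base/tgtFactor over numSteps plateaus. A minimal Python sketch under that assumption:

def dexp_schedule(base, tgt_factor, num_steps, progress):
    # Discrete exponential decay: hold the rate on num_steps plateaus, dropping
    # by a constant factor each step so the final plateau equals base/tgt_factor.
    # progress is the fraction of training completed, in [0, 1).
    # This is an illustrative reading of dexp[...], not the reference code.
    step = min(int(progress * num_steps), num_steps - 1)
    per_step = tgt_factor ** (1.0 / (num_steps - 1)) if num_steps > 1 else float(tgt_factor)
    return base / per_step ** step

# Example: epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
for p in (0.0, 0.3, 0.6, 0.9):
    print("progress %.1f -> eps %.6f" % (p, dexp_schedule(0.02, 250.0, 4, p)))
# the last plateau is 0.02 / 250 = 0.00008
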
+[dropout1b] +enable=true +keep=0.5 + +[dropout2b] +enable=true +keep=0.5 + +[rnorm1a] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm1b] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2a] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2b] +scale=0.0001 +pow=0.75 +minDiv=2 + +[cnorm2a] +scale=0.001 +pow=0.75 + +[cnorm2b] +scale=0.001 +pow=0.75 diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg new file mode 100644 index 0000000..4d1f078 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg @@ -0,0 +1,169 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] + +[fc1024a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024c] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024d] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024ba] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bb] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bc] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1024bd] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1a] +enable=true +keep=0.5 + +[dropout1b] +enable=true +keep=0.5 + +[dropout1c] +enable=true +keep=0.5 + +[dropout1d] +enable=true +keep=0.5 + +[dropout2a] +enable=true +keep=0.5 + +[dropout2b] +enable=true +keep=0.5 + +[dropout2c] +enable=true +keep=0.5 + +[dropout2d] +enable=true +keep=0.5 + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg new file mode 100644 index 0000000..b3febfd --- /dev/null +++ 
b/caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg @@ -0,0 +1,93 @@ +[conv1] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv2] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0.00 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv3] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv4] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[conv5] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096a] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc4096b] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[fc1000] +momW=0.9 +momB=0.9 +wc=0.0005 +wball=0 +epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] +epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] +updatePeriod=1 + +[logprob] +coeff=1 +topk=5 + +[dropout1] +enable=true + +[dropout2] +enable=true + +[rnorm1] +scale=0.0001 +pow=0.75 +minDiv=2 + +[rnorm2] +scale=0.0001 +pow=0.75 +minDiv=2 + + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg new file mode 100644 index 0000000..44fc31a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg @@ -0,0 +1,103 @@ +[data] +type=data +dataIdx=0 + +[labels] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=2 +stride=1 +filterSize=5 +neuron=relu +initW=0.0001 +sumWidth=4 +sharedBiases=1 +gpu=0 + +[pool1] +type=pool +pool=max +inputs=conv1 +start=0 +sizeX=3 +stride=2 +outputsX=0 +channels=64 + +[rnorm1] +type=cmrnorm +inputs=pool1 +channels=64 +size=9 + +[conv2] +type=conv +inputs=rnorm1 +filters=64 +padding=2 +stride=1 +filterSize=5 +channels=64 +neuron=relu +initW=0.01 +sumWidth=2 +sharedBiases=1 + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=64 +size=9 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +start=0 +sizeX=3 +stride=2 +outputsX=0 +channels=64 + +[local3] +type=local +inputs=pool2 +filters=64 +padding=1 +stride=1 +filterSize=3 +channels=64 +neuron=relu +initW=0.04 + +[local4] +type=local +inputs=local3 +filters=32 +padding=1 +stride=1 +filterSize=3 +channels=64 +neuron=relu +initW=0.04 + +[fc10] +type=fc +outputs=10 +inputs=local4 +initW=0.01 + +[probs] +type=softmax +inputs=fc10 + +[logprob] +type=cost.logreg +inputs=labels,probs +gpu=0 diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg new file mode 100644 index 0000000..0b549bb --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg @@ -0,0 +1,155 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 
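When a pool section like the one above leaves outputsX at 0, PoolLayerParser (earlier in layer.py) derives it as ceil((imgSize - start - sizeX) / stride) + 1. A quick check of that formula; the 55x55 input size is only an assumed example, since the real value depends on the preceding conv layer:

from math import ceil

def pool_outputs_x(img_size, size_x, stride, start=0):
    # Mirrors PoolLayerParser: number of pooling windows along one axis.
    return int(ceil((img_size - start - size_x) / float(stride))) + 1

# sizeX=3, stride=2 as in the pool layers above; 55 is a hypothetical input width.
print(pool_outputs_x(55, 3, 2))   # 27, so the layer emits 27*27*channels outputs
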
+channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 +gpu=0 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg new file mode 100644 index 0000000..f27093c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg @@ -0,0 +1,152 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0,1 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg new file mode 100644 index 0000000..5180134 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg @@ -0,0 +1,304 @@ +[data] 
+type=data +dataIdx=0 + +[labels] +type=data +dataIdx=1 + +[conv1a] +type=conv +inputs=data +channels=3 +filters=48 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=3 +sharedBiases=1 +gpu=0 + +[conv1b] +type=conv +inputs=data +channels=3 +filters=48 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=3 +sharedBiases=1 +gpu=1 + +[rnorm1a] +type=cmrnorm +inputs=conv1a +channels=48 +size=5 + +[rnorm1b] +type=cmrnorm +inputs=conv1b +channels=48 +size=5 + +[pool1a] +type=pool +pool=max +inputs=rnorm1a +sizeX=3 +stride=2 +channels=48 +neuron=relu + +[pool1b] +type=pool +pool=max +inputs=rnorm1b +sizeX=3 +stride=2 +channels=48 +neuron=relu + +[conv2a] +type=conv +inputs=pool1a +filters=128 +padding=2 +stride=1 +filterSize=5 +channels=48 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu +gpu=0 + +[conv2b] +type=conv +inputs=pool1b +filters=128 +padding=2 +stride=1 +filterSize=5 +channels=48 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu +gpu=1 + +[rnorm2a] +type=cmrnorm +inputs=conv2a +channels=128 +size=5 + +[rnorm2b] +type=cmrnorm +inputs=conv2b +channels=128 +size=5 + +[cnorm2a] +type=rnorm +inputs=rnorm2a +channels=128 +size=5 + +[cnorm2b] +type=rnorm +inputs=rnorm2b +channels=128 +size=5 + +[pool2a] +type=pool +pool=max +inputs=cnorm2a +sizeX=3 +stride=2 +channels=128 + +[pool2b] +type=pool +pool=max +inputs=cnorm2b +sizeX=3 +stride=2 +channels=128 + +[conv3a] +type=conv +inputs=pool2a,pool2b +filters=192,192 +padding=1,1 +stride=1,1 +filterSize=3,3 +channels=128,128 +initW=0.03,0.03 +sumWidth=2 +sharedBiases=1 +neuron=relu +gpu=0 + +[conv3b] +type=conv +inputs=pool2a,pool2b +filters=192,192 +padding=1,1 +stride=1,1 +filterSize=3,3 +channels=128,128 +initW=0.03,0.03 +sumWidth=2 +sharedBiases=1 +neuron=relu +gpu=1 + +[conv4a] +type=conv +inputs=conv3a +filters=192 +padding=1 +stride=1 +filterSize=3 +channels=192 +neuron=relu +initW=0.03 +initB=1 +sumWidth=2 +sharedBiases=1 + +[conv4b] +type=conv +inputs=conv3b +filters=192 +padding=1 +stride=1 +filterSize=3 +channels=192 +neuron=relu +initW=0.03 +initB=1 +sumWidth=2 +sharedBiases=1 + + +[conv5a] +type=conv +inputs=conv4a +filters=128 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +initB=1 +sumWidth=2 +groups=1 +randSparse=0 + +[conv5b] +type=conv +inputs=conv4b +filters=128 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +initB=1 +sumWidth=2 +groups=1 +randSparse=0 + +[pool3a] +type=pool +pool=max +inputs=conv5a +sizeX=3 +stride=2 +channels=128 +neuron=relu + +[pool3b] +type=pool +pool=max +inputs=conv5b +sizeX=3 +stride=2 +channels=128 +neuron=relu + +[fc2048a] +type=fc +inputs=pool3a,pool3b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=0 + +[fc2048b] +type=fc +inputs=pool3a,pool3b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=1 + +[dropout1a] +type=dropout +inputs=fc2048a + +[dropout1b] +type=dropout +inputs=fc2048b + +[fc2048ba] +type=fc +inputs=dropout1a,dropout1b +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=0 + +[fc2048bb] +type=fc +inputs=dropout1b,dropout1a +outputs=2048 +initW=0.01,0.01 +initB=1 +neuron=relu +gpu=1 + +[dropout2a] +type=dropout +inputs=fc2048ba + +[dropout2b] +type=dropout +inputs=fc2048bb + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2a,dropout2b +initW=0.01,0.01 +gpu=0 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labels,probs +gpu=0 diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg 
b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg new file mode 100644 index 0000000..3d79b4d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg @@ -0,0 +1,257 @@ +[data] +type=data +dataIdx=0 + +[labvec] +type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1,2,3 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc1024a] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=0 + +[fc1024b] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=1 + +[fc1024c] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=2 + +[fc1024d] +type=fc +inputs=pool3 +outputs=1024 +initW=0.01 +initB=1 +neuron=relu +gpu=3 + +[dropout1a] +type=dropout2 +inputs=fc1024a + +[dropout1b] +type=dropout2 +inputs=fc1024b + +[dropout1c] +type=dropout2 +inputs=fc1024c + +[dropout1d] +type=dropout2 +inputs=fc1024d + +# This is like a concatenation layer +[pass1a] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=0 + +# This is like a concatenation layer +[pass1b] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=1 + +# This is like a concatenation layer +[pass1c] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=2 + +# This is like a concatenation layer +[pass1d] +type=pass +inputs=dropout1a,dropout1b,dropout1c,dropout1d +gpu=3 + + +[fc1024ba] +type=fc +inputs=pass1a +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bb] +type=fc +inputs=pass1b +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bc] +type=fc +inputs=pass1c +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[fc1024bd] +type=fc +inputs=pass1d +outputs=1024 +initW=0.01 +initB=1 +neuron=relu + +[dropout2a] +type=dropout2 +inputs=fc1024ba + +[dropout2b] +type=dropout2 +inputs=fc1024bb + +[dropout2c] +type=dropout2 +inputs=fc1024bc + +[dropout2d] +type=dropout2 +inputs=fc1024bd + +[pass2a] +inputs=dropout2a,dropout2b,dropout2c,dropout2d +type=pass +gpu=0 + +[fc1000] +type=fc +outputs=1000 +inputs=pass2a +initW=0.01 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0 + diff --git a/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg new file mode 100644 index 0000000..e804fdc --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg @@ -0,0 +1,152 @@ +[data] +type=data +dataIdx=0 + +[labvec] 
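The pass layers in the 4-GPU data+model config above are annotated as concatenation-like, which matches PassThroughLayerParser earlier in layer.py: it checks that replica counts and GPU sets are compatible and sets the layer's output count to the sum of its inputs' outputs. A small sketch of that bookkeeping using the values from the config (the layer dicts here are simplified stand-ins):

def pass_through_outputs(input_layers):
    # Mirrors PassThroughLayerParser: a pass layer exposes the concatenation of
    # its inputs, so its output count is the sum of the inputs' output counts.
    return sum(layer['outputs'] for layer in input_layers)

dropouts = [{'name': 'dropout1' + s, 'outputs': 1024} for s in 'abcd']
print(pass_through_outputs(dropouts))   # 4096: what each fc1024b* layer sees via pass1*
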
+type=data +dataIdx=1 + +[conv1] +type=conv +inputs=data +channels=3 +filters=64 +padding=0 +stride=4 +filterSize=11 +initW=0.01 +sumWidth=4 +sharedBiases=1 +gpu=0,1,2,3 + +[rnorm1] +type=cmrnorm +inputs=conv1 +channels=64 +size=5 + +[pool1] +type=pool +pool=max +inputs=rnorm1 +sizeX=3 +stride=2 +channels=64 +neuron=relu + +[conv2] +type=conv +inputs=pool1 +filters=192 +padding=2 +stride=1 +filterSize=5 +channels=64 +initW=0.01 +initB=1 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[rnorm2] +type=cmrnorm +inputs=conv2 +channels=192 +size=5 + +[pool2] +type=pool +pool=max +inputs=rnorm2 +sizeX=3 +stride=2 +channels=192 + +[conv3] +type=conv +inputs=pool2 +filters=384 +padding=1 +stride=1 +filterSize=3 +channels=192 +initW=0.03 +sumWidth=3 +sharedBiases=1 +neuron=relu + +[conv4] +type=conv +inputs=conv3 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=384 +neuron=relu +initW=0.03 +initB=1 +sumWidth=3 +sharedBiases=1 + +[conv5] +type=conv +inputs=conv4 +filters=256 +padding=1 +stride=1 +filterSize=3 +channels=256 +initW=0.03 +initB=1 +sumWidth=3 + +[pool3] +type=pool +pool=max +inputs=conv5 +sizeX=3 +stride=2 +channels=256 +neuron=relu + +[fc4096a] +type=fc +inputs=pool3 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout1] +type=dropout2 +inputs=fc4096a + +[fc4096b] +type=fc +inputs=dropout1 +outputs=4096 +initW=0.01 +initB=1 +neuron=relu + +[dropout2] +type=dropout2 +inputs=fc4096b + +[fc1000] +type=fc +outputs=1000 +inputs=dropout2 +initW=0.01 +initB=-7 + +[probs] +type=softmax +inputs=fc1000 + +[logprob] +type=cost.logreg +inputs=labvec,probs +gpu=0,1,2,3 + diff --git a/caffe2/contrib/cuda-convnet2/make-data/input_meta b/caffe2/contrib/cuda-convnet2/make-data/input_meta new file mode 100644 index 0000000..659b20b Binary files /dev/null and b/caffe2/contrib/cuda-convnet2/make-data/input_meta differ diff --git a/caffe2/contrib/cuda-convnet2/make-data/make-data.py b/caffe2/contrib/cuda-convnet2/make-data/make-data.py new file mode 100644 index 0000000..1861ceb --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/make-data.py @@ -0,0 +1,157 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################# + + +# This script makes batches suitable for training from raw ILSVRC 2012 tar files. + +import tarfile +from StringIO import StringIO +from random import shuffle +import sys +from time import time +from pyext._MakeDataPyExt import resizeJPEG +import itertools +import os +import cPickle +import scipy.io +import math +import argparse as argp + +# Set this to True to crop images to square. In this case each image will be +# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels, and then the +# center OUTPUT_IMAGE_SIZE x OUTPUT_IMAGE_SIZE patch will be extracted. +# +# Set this to False to preserve image borders. In this case each image will be +# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels. 
This was +# demonstrated to be superior by Andrew Howard in his very nice paper: +# http://arxiv.org/abs/1312.5402 +CROP_TO_SQUARE = True +OUTPUT_IMAGE_SIZE = 256 + +# Number of threads to use for JPEG decompression and image resizing. +NUM_WORKER_THREADS = 8 + +# Don't worry about these. +OUTPUT_BATCH_SIZE = 3072 +OUTPUT_SUB_BATCH_SIZE = 1024 + +def pickle(filename, data): + with open(filename, "w") as fo: + cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) + +def unpickle(filename): + fo = open(filename, 'r') + contents = cPickle.load(fo) + fo.close() + return contents + +def partition_list(l, partition_size): + divup = lambda a,b: (a + b - 1) / b + return [l[i*partition_size:(i+1)*partition_size] for i in xrange(divup(len(l),partition_size))] + +def open_tar(path, name): + if not os.path.exists(path): + print "ILSVRC 2012 %s not found at %s. Make sure to set ILSVRC_SRC_DIR correctly at the top of this file (%s)." % (name, path, sys.argv[0]) + sys.exit(1) + return tarfile.open(path) + +def makedir(path): + if not os.path.exists(path): + os.makedirs(path) + +def parse_devkit_meta(ILSVRC_DEVKIT_TAR): + tf = open_tar(ILSVRC_DEVKIT_TAR, 'devkit tar') + fmeta = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/meta.mat')) + meta_mat = scipy.io.loadmat(StringIO(fmeta.read())) + labels_dic = dict((m[0][1][0], m[0][0][0][0]-1) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) + label_names_dic = dict((m[0][1][0], m[0][2][0]) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) + label_names = [tup[1] for tup in sorted([(v,label_names_dic[k]) for k,v in labels_dic.items()], key=lambda x:x[0])] + + fval_ground_truth = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt')) + validation_ground_truth = [[int(line.strip()) - 1] for line in fval_ground_truth.readlines()] + tf.close() + return labels_dic, label_names, validation_ground_truth + +def write_batches(target_dir, name, start_batch_num, labels, jpeg_files): + jpeg_files = partition_list(jpeg_files, OUTPUT_BATCH_SIZE) + labels = partition_list(labels, OUTPUT_BATCH_SIZE) + makedir(target_dir) + print "Writing %s batches..." 
% name + for i,(labels_batch, jpeg_file_batch) in enumerate(zip(labels, jpeg_files)): + t = time() + jpeg_strings = list(itertools.chain.from_iterable(resizeJPEG([jpeg.read() for jpeg in jpeg_file_batch], OUTPUT_IMAGE_SIZE, NUM_WORKER_THREADS, CROP_TO_SQUARE))) + batch_path = os.path.join(target_dir, 'data_batch_%d' % (start_batch_num + i)) + makedir(batch_path) + for j in xrange(0, len(labels_batch), OUTPUT_SUB_BATCH_SIZE): + pickle(os.path.join(batch_path, 'data_batch_%d.%d' % (start_batch_num + i, j/OUTPUT_SUB_BATCH_SIZE)), + {'data': jpeg_strings[j:j+OUTPUT_SUB_BATCH_SIZE], + 'labels': labels_batch[j:j+OUTPUT_SUB_BATCH_SIZE]}) + print "Wrote %s (%s batch %d of %d) (%.2f sec)" % (batch_path, name, i+1, len(jpeg_files), time() - t) + return i + 1 + +if __name__ == "__main__": + parser = argp.ArgumentParser() + parser.add_argument('--src-dir', help='Directory containing ILSVRC2012_img_train.tar, ILSVRC2012_img_val.tar, and ILSVRC2012_devkit_t12.tar.gz', required=True) + parser.add_argument('--tgt-dir', help='Directory to output ILSVRC 2012 batches suitable for cuda-convnet to train on.', required=True) + args = parser.parse_args() + + print "CROP_TO_SQUARE: %s" % CROP_TO_SQUARE + print "OUTPUT_IMAGE_SIZE: %s" % OUTPUT_IMAGE_SIZE + print "NUM_WORKER_THREADS: %s" % NUM_WORKER_THREADS + + ILSVRC_TRAIN_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_train.tar') + ILSVRC_VALIDATION_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_val.tar') + ILSVRC_DEVKIT_TAR = os.path.join(args.src_dir, 'ILSVRC2012_devkit_t12.tar.gz') + + assert OUTPUT_BATCH_SIZE % OUTPUT_SUB_BATCH_SIZE == 0 + labels_dic, label_names, validation_labels = parse_devkit_meta(ILSVRC_DEVKIT_TAR) + + with open_tar(ILSVRC_TRAIN_TAR, 'training tar') as tf: + synsets = tf.getmembers() + synset_tars = [tarfile.open(fileobj=tf.extractfile(s)) for s in synsets] + print "Loaded synset tars." + print "Building training set image list (this can take 10-20 minutes)..." + sys.stdout.flush() + + train_jpeg_files = [] + for i,st in enumerate(synset_tars): + if i % 100 == 0: + print "%d%% ..." % int(round(100.0 * float(i) / len(synset_tars))), + sys.stdout.flush() + train_jpeg_files += [st.extractfile(m) for m in st.getmembers()] + st.close() + + shuffle(train_jpeg_files) + train_labels = [[labels_dic[jpeg.name[:9]]] for jpeg in train_jpeg_files] + print "done" + + # Write training batches + i = write_batches(args.tgt_dir, 'training', 0, train_labels, train_jpeg_files) + + # Write validation batches + val_batch_start = int(math.ceil((i / 1000.0))) * 1000 + with open_tar(ILSVRC_VALIDATION_TAR, 'validation tar') as tf: + validation_jpeg_files = sorted([tf.extractfile(m) for m in tf.getmembers()], key=lambda x:x.name) + write_batches(args.tgt_dir, 'validation', val_batch_start, validation_labels, validation_jpeg_files) + + # Write meta file + meta = unpickle('input_meta') + meta_file = os.path.join(args.tgt_dir, 'batches.meta') + meta.update({'batch_size': OUTPUT_BATCH_SIZE, + 'num_vis': OUTPUT_IMAGE_SIZE**2 * 3, + 'label_names': label_names}) + pickle(meta_file, meta) + print "Wrote %s" % meta_file + print "All done! ILSVRC 2012 batches are in %s" % args.tgt_dir diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile b/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile new file mode 100644 index 0000000..7b7ae56 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile @@ -0,0 +1,50 @@ +# Copyright 2014 Google Inc. All rights reserved. 
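make-data.py above shuffles the training JPEGs, cuts them into batches of OUTPUT_BATCH_SIZE, and writes each batch directory as OUTPUT_SUB_BATCH_SIZE-sized pickles named data_batch_N.K. A minimal sketch of that partitioning (the 10000-image count is just an example):

def partition_list(l, partition_size):
    # Ceil-divide so the last partition keeps the remainder, as in make-data.py.
    divup = lambda a, b: (a + b - 1) // b
    return [l[i * partition_size:(i + 1) * partition_size]
            for i in range(divup(len(l), partition_size))]

OUTPUT_BATCH_SIZE = 3072
OUTPUT_SUB_BATCH_SIZE = 1024

images = list(range(10000))                       # stand-in for the shuffled JPEG list
batches = partition_list(images, OUTPUT_BATCH_SIZE)
print([len(b) for b in batches])                  # [3072, 3072, 3072, 784]
print([len(partition_list(b, OUTPUT_SUB_BATCH_SIZE)) for b in batches])
# [3, 3, 3, 1]: data_batch_N.0 .. data_batch_N.2 (or .0 only) inside each batch dir
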
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDES := -I./include +COMMONFLAGS := +CC_ARGS := + +ifndef debug + CC_ARGS += -O3 +endif +CC=g++ + +OUT_DIR=./bin/$(OUT_SUFFIX) + +PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) +LINK_LIBS := -L$(CUDA_INSTALL_PATH)/lib64 `pkg-config --libs python` `pkg-config --libs opencv` -lpthread + +INCLUDES += -I$(PYTHON_INCLUDE_PATH) +OUT_FILE=_MakeDataPyExt.so + +all: dir classes $(OUT_FILE) + +dir: + mkdir -p $(OUT_DIR)/src + +SOURCES = $(shell echo src/*.cpp) +CLASSES = $(SOURCES:.cpp=.o) + +classes: $(CLASSES) + +%.o: %.cpp + $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o + +$(OUT_FILE): classes + cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS) + ln -sf $(OUT_DIR)/$(OUT_FILE) . + +clean: + rm -rf $(OUT_DIR)/* diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py b/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py new file mode 100644 index 0000000..520b1ea --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h b/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h new file mode 100644 index 0000000..6e4c655 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h @@ -0,0 +1,59 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
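The pyext.h header that follows declares the resizeJPEG entry point used by make-data.py. Judging from the PyArg_ParseTuple call in pyext.cpp, it takes (list of JPEG byte strings, target size, worker threads, crop flag) and returns one list per worker thread, which is why make-data.py flattens the result with itertools.chain. A hedged usage sketch; the file paths are placeholders:

import itertools
from pyext._MakeDataPyExt import resizeJPEG   # built by the Makefile above

raw = [open(p, 'rb').read() for p in ('img0.jpg', 'img1.jpg')]   # placeholder paths
per_thread = resizeJPEG(raw, 256, 8, True)    # size=256, 8 worker threads, crop to square
resized = list(itertools.chain.from_iterable(per_thread))
print(len(resized))                            # one re-encoded JPEG string per input
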
+ */ + +#ifndef INCLUDE_PYEXT_H_ +#define INCLUDE_PYEXT_H_ + +#include +//#include +#include +#include +#include "../../../util/include/thread.h" + +#define JPEG_QUALITY 95 + +#ifndef DIVUP +#define DIVUP(a,b) (((a) + (b) - 1) / (b)) +#endif + +extern "C" { + void init_MakeDataPyExt(); +} +PyObject* resizeJPEG(PyObject *self, PyObject *args); + +class DecoderThread : public Thread { + protected: + PyObject* _py_list_src; + PyObject* _py_list_tgt; + int _start_img, _end_img; + int _target_size; + bool _crop_to_square; + + cv::Mat _resized_mat_buffer; + std::vector _output_jpeg_buffer; + std::vector _encode_params; + + void* run(); + void makeJPEG(int idx); + + public: + DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square); + virtual ~DecoderThread(); + PyObject* getTargetList(); +}; + + +#endif // INCLUDE_PYEXT_H_ diff --git a/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp b/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp new file mode 100644 index 0000000..0e3c0c7 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp @@ -0,0 +1,131 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/pyext.h" + +using namespace std; + +static PyMethodDef _MakeDataPyExtMethods[] = {{ "resizeJPEG", resizeJPEG, METH_VARARGS }, + { NULL, NULL } +}; + +void init_MakeDataPyExt() { + (void) Py_InitModule("_MakeDataPyExt", _MakeDataPyExtMethods); +} + +PyObject* resizeJPEG(PyObject *self, PyObject *args) { + + PyListObject* pyListSrc; + int tgtImgSize, numThreads; + int cropToSquare; + + if (!PyArg_ParseTuple(args, "O!iii", + &PyList_Type, &pyListSrc, + &tgtImgSize, + &numThreads, + &cropToSquare)) { + return NULL; + } + + DecoderThread* threads[numThreads]; + int num_imgs = PyList_GET_SIZE(pyListSrc); + int num_imgs_per_thread = DIVUP(num_imgs, numThreads); + for (int t = 0; t < numThreads; ++t) { + int start_img = t * num_imgs_per_thread; + int end_img = min(num_imgs, (t+1) * num_imgs_per_thread); + + threads[t] = new DecoderThread((PyObject*)pyListSrc, start_img, end_img, tgtImgSize, cropToSquare); + threads[t]->start(); + } + + PyObject* pyListTgt = PyList_New(0); + for (int t = 0; t < numThreads; ++t) { + threads[t]->join(); + PyList_Append(pyListTgt, threads[t]->getTargetList()); + delete threads[t]; // the thread's list too + } + + return pyListTgt; +} + +DecoderThread::DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square) +: Thread(true), _py_list_src(py_list_src), _start_img(start_img), _end_img(end_img), _target_size(target_size), _crop_to_square(crop_to_square) { + + _encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); + _encode_params.push_back(JPEG_QUALITY); + _py_list_tgt = PyList_New(0); +} + +DecoderThread::~DecoderThread(){ + Py_DECREF(_py_list_tgt); +} + +void* DecoderThread::run() { + for (int i = _start_img; i < _end_img; ++i) { + makeJPEG(i); + } + return NULL; 
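+ // Each DecoderThread works on its half-open slice [_start_img, _end_img) of the
+ // source list; makeJPEG() below decodes one JPEG, scales its shortest edge to
+ // _target_size, optionally center-crops to a square, and re-encodes at JPEG_QUALITY.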
+} + +PyObject* DecoderThread::getTargetList() { + return _py_list_tgt; +} + +void DecoderThread::makeJPEG(int idx) { + /* + * Decompress JPEG + */ + PyObject* pySrc = PyList_GET_ITEM(_py_list_src, idx); + uchar* src = (unsigned char*)PyString_AsString(pySrc); + size_t src_len = PyString_GET_SIZE(pySrc); + vector src_vec(src, src + src_len); + + cv::Mat decoded_mat = cv::imdecode(cv::Mat(src_vec), CV_LOAD_IMAGE_COLOR); + assert(decoded_mat.channels() == 3); + + /* + * Resize + */ + double min_dim = std::min(decoded_mat.size().height, decoded_mat.size().width); + double scale_factor = _target_size / min_dim; + + int new_height = round(scale_factor * decoded_mat.size().height); + int new_width = round(scale_factor * decoded_mat.size().width); + assert((new_height == _target_size && new_width >= _target_size) + || (new_width == _target_size && new_height >= _target_size)); + int interpolation = scale_factor == 1 ? cv::INTER_LINEAR + : scale_factor > 1 ? cv::INTER_CUBIC : cv::INTER_AREA; + + cv::resize(decoded_mat, _resized_mat_buffer, cv::Size(new_width, new_height), 0, 0, interpolation); + + /* + * Conditionally crop and compress JPEG + */ + if (_crop_to_square) { + int crop_start_x = (new_width - _target_size) / 2; + int crop_start_y = (new_height - _target_size) / 2; + cv::Rect cropRect(crop_start_x, crop_start_y, _target_size, _target_size); + cv::Mat cropped_mat_buffer = _resized_mat_buffer(cropRect); + cv::imencode(".jpg", cropped_mat_buffer, _output_jpeg_buffer, _encode_params); + } else { + cv::imencode(".jpg", _resized_mat_buffer, _output_jpeg_buffer, _encode_params); + } + + char* output_jpeg_buffer_ptr = reinterpret_cast(&_output_jpeg_buffer[0]); + PyObject* pyStr = PyString_FromStringAndSize(output_jpeg_buffer_ptr, _output_jpeg_buffer.size()); + PyList_Append(_py_list_tgt, pyStr); + Py_DECREF(pyStr); +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile b/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile new file mode 100644 index 0000000..81b8dd4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/Makefile @@ -0,0 +1,108 @@ +################################################################################ +# +# Copyright 1993-2012 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ + +# Location of the CUDA Toolkit binaries and libraries +CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include +CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin +CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64 + +# Common binaries +NVCC = $(CUDA_BIN_PATH)/nvcc +GCC = g++ +AR = ar + +# CUDA code generation flags +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_FLAGS := $(GENCODE_SM35) + +LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart +CCFLAGS := -m64 +NVCCFLAGS := -m64 + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + NVCCFLAGS += -g -G + DBG := debug +else + DBG := release + NVCCFLAGS += -O3 + CCFLAGS += -O3 +endif + +# Add profiler output +ifeq ($(prof),1) + NVCCFLAGS += --ptxas-options=-v +endif + +TARGETDIR := ./bin/$(DBG) +OBJDIR := ./obj/$(DBG) + +########## USER STUFF ########### +LDFLAGS += -L../util -lutilpy -lcublas +INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include + +CUFILES := $(shell find . -name "*.cu") +CU_DEPS := $(shell find . -name "*.cuh") +CCFILES := $(shell find . -name "*.cpp") +C_DEPS := $(shell find . -name "*.h") + +NVCCFLAGS += --compiler-options '-fPIC' +LDFLAGS += -shared +CCFLAGS += -fPIC +TARGET := $(TARGETDIR)/libnvmatrix.so + +################################################################################ +# Set up target and object files +################################################################################ +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES)) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES)) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES)) + +# Target rules +all: makedirs $(TARGET) + +$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS) + $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS) + $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< + +$(TARGET): $(OBJS) + $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) + ln -sf $(TARGET) . + +makedirs: + mkdir -p $(TARGETDIR) + mkdir -p $(OBJDIR)/src + +clean: + rm -rf ./obj diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh new file mode 100644 index 0000000..5154a0d --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh @@ -0,0 +1,317 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
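The memory.cuh header below implements FastMemoryManager, a bucketed GPU allocator: requests are rounded up to power-of-two size classes starting at 4 KB (FIRST_BUCKET_SIZE), and a bit vector tracks which classes currently have free segments. A Python restatement of the size-class math encoded by the GET_ALLOC_BUCKET and GET_BUCKET_SIZE macros, for illustration only:

LOG_FIRST_BUCKET_SIZE = 12            # first bucket serves allocations up to 4 KB

def get_alloc_bucket(size):
    # Same computation as GET_ALLOC_BUCKET: smallest b with size <= 4096 * 2**b.
    return ((size - 1) >> LOG_FIRST_BUCKET_SIZE).bit_length()

def get_bucket_size(bucket):
    # Same computation as GET_BUCKET_SIZE.
    return 1 << (LOG_FIRST_BUCKET_SIZE + bucket)

for size in (1, 4096, 4097, 100000):
    b = get_alloc_bucket(size)
    print("%6d bytes -> bucket %d (%d bytes)" % (size, b, get_bucket_size(b)))
# 1 and 4096 land in bucket 0, 4097 in bucket 1 (8192), 100000 in bucket 5 (131072)
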
+ */ + +#ifndef MEMORY_CUH_H_ +#define MEMORY_CUH_H_ +#include +#include +#include +#include +#include + +#include +#include "../../util/include/sync.h" +#include "nvmatrix_kernels.cuh" + +#define GPU_ALLOC_FRACTION 0.95 // Take 95% of available GPU memory +#define HOST_ALLOC_CHUNK (1UL << 32) +#define SYNC_ON_FREE true +#define BUCKET_TYPE unsigned int + +// Allocte memory from up to this many buckets higher than desired without subdividing +#define BUCKET_DIVISION_THRESHOLD 1 +#define NUM_BUCKETS static_cast(sizeof(BUCKET_TYPE) * 8) +#define CLZ(x) ((x) == 0 ? (NUM_BUCKETS) : __builtin_clz(x)) +#define CEIL_LOG2(x) (NUM_BUCKETS - CLZ(x)) // Ceiling of log base 2 of (x + 1) +#define LOG_FIRST_BUCKET_SIZE 12 +#define FIRST_BUCKET_SIZE (1 << LOG_FIRST_BUCKET_SIZE) // First bucket is for 4K bytes +#define GET_ALLOC_BUCKET(size) (CEIL_LOG2(((size) - 1) >> LOG_FIRST_BUCKET_SIZE)) +#define GET_DEALLOC_BUCKET(size) (CEIL_LOG2((size) >> (1 + LOG_FIRST_BUCKET_SIZE))) +#define GET_BUCKET_SIZE(b) (1UL << (LOG_FIRST_BUCKET_SIZE + b)) + +#define BUCKET_MASK(b) (1UL << (b)) +#define PREV_BUCKETS_MASK(b) (BUCKET_MASK(b) - 1) +#define AVAILABLE_NEXT_MASK(b, buckets) ((buckets) & ~PREV_BUCKETS_MASK(b)) + +/* + * Returns the "best-matching" available bucket as defined by policy. + * The two policies are: + * + * TAKE_FROM_BIGGEST = true: If a bucket in the range + * b...{b + BUCKET_DIVISION_THRESHOLD} is available, return the smallest + * available bucket in that range. Otherwise return the *biggest* available + * bucket greater than or equal to b. + * + * TAKE_FROM_BIGGEST = false: Return the *smallest* available bucket greater + * than or equal to b. + * + * Returns -1 when no satisfactory bucket is available. + */ +#define TAKE_FROM_BIGGEST true +#if TAKE_FROM_BIGGEST +#define GET_AVAILABLE_BUCKET(b, buckets) \ + (-1 + (((AVAILABLE_NEXT_MASK(b, buckets)) \ + & (PREV_BUCKETS_MASK((b) + 1 + BUCKET_DIVISION_THRESHOLD))) \ + /* Smallest bucket >= b */ ? __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) \ + /* Biggest bucket >= b */ : CEIL_LOG2(AVAILABLE_NEXT_MASK(b, buckets)))) +#else +#define GET_AVAILABLE_BUCKET(b, buckets) __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) +#endif + +/* + * Bit get/set/clear. 
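+ *
+ * Worked example for the bucket macros defined above (editorial illustration,
+ * not part of the original source): a request for 10000 bytes gives
+ * GET_ALLOC_BUCKET(10000) = CEIL_LOG2((10000 - 1) >> 12) = CEIL_LOG2(2) = 2,
+ * and GET_BUCKET_SIZE(2) = 1 << (12 + 2) = 16384 bytes, the smallest bucket
+ * size that can hold the request; a 4096-byte request maps to bucket 0 (4 KB).
+ * With TAKE_FROM_BIGGEST and BUCKET_DIVISION_THRESHOLD = 1, such a request is
+ * served from bucket 2 or 3 if either is free, and otherwise from the largest
+ * free bucket, which malloc() then subdivides.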
+ */ +#define GET_BIT(x, bit) ((x) & (1 << (bit))) +#define SET_BIT(x, bit) ((x) |= (1 << (bit))) +#define CLEAR_BIT(x, bit) ((x) &= ~(1 << (bit))) + +typedef struct __align__(512) { + char data; +} DataType; + +#define SIZE_ROUNDUP(size) (sizeof(DataType) * DIVUP((size), sizeof(DataType))) + +class MemorySegment { + friend class FastMemoryManager; +protected: + DataType* _data; + size_t _size; + int _deviceID; + // Resizes itself to _size - size and + // returns pointer to new memory segment + MemorySegment* subdivide(size_t size) { + assert(size < _size); +// assert(size % sizeof(DataType) == 0); + _size -= size; + return new MemorySegment(_data + _size / sizeof(DataType), size, _deviceID); + } + + inline size_t getSize() const { + return _size; + } +public: + MemorySegment(DataType* data, size_t size, int deviceID) : _data(data), _size(size), _deviceID(deviceID) { + assert(size % sizeof(DataType) == 0); + } + // In some cases size is irrelevant + template MemorySegment(T* data) : _data(reinterpret_cast(data)), _size(0), _deviceID(-1) { + + } + + template + inline T* getData() const { + return reinterpret_cast(_data); + } + + template + inline T** getDataPtr() { + return reinterpret_cast(&_data); + } + + inline int getDeviceID() const { + return _deviceID; + } +}; + +class MemoryManager { +protected: + static Lock _globalLock; +public: + virtual MemoryManager* init() = 0; + virtual MemorySegment* malloc(size_t size) = 0; + virtual void free(MemorySegment* mem) = 0; + virtual ~MemoryManager() { + + } +}; + +class FastMemoryManager : public MemoryManager { +protected: + int _deviceID; + Lock _lock; + DataType* _data; + size_t _size; + BUCKET_TYPE _buckets; // Bucket availability bit vector + std::vector > _freeSegments; // bucket idx -> vector of segments + + static std::map _memoryManagers; + + virtual void allocateInitialSegment() { + assert(_deviceID >= 0); + assert(FIRST_BUCKET_SIZE % sizeof(DataType) == 0); + checkCudaErrors(cudaSetDevice(_deviceID)); + size_t memFree, memTotal; + checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal)); + _size = sizeof(DataType) * (size_t(round(double(memFree) * GPU_ALLOC_FRACTION)) / sizeof(DataType)); + printf("FastMemoryManager[%d] allocating %lu-byte initial segment\n", _deviceID, _size); + checkCudaErrors(cudaMalloc(&_data, _size)); + } + + virtual void freeInitialSegment() { + checkCudaErrors(cudaFree(_data)); + } + +public: + static MemoryManager& getInstance(int deviceID); + static void destroyInstance(int deviceID); + + FastMemoryManager(int deviceID) : _deviceID(deviceID), _data(NULL), _size(0), _buckets(0) { + } + + ~FastMemoryManager() { + freeInitialSegment(); + for (int i = 0; i < _freeSegments.size(); ++i) { + for (int j = 0; j < _freeSegments[i].size(); ++j) { + delete _freeSegments[i][j]; + } + } + } + + virtual MemoryManager* init() { + allocateInitialSegment(); + + for (int i = 0; i < NUM_BUCKETS; ++i) { + _freeSegments.push_back(std::vector()); + } + int bucket = GET_DEALLOC_BUCKET(_size); + SET_BIT(_buckets, bucket); + _freeSegments[bucket].push_back(new MemorySegment(_data, _size, _deviceID)); + return this; + } + + MemorySegment* malloc(size_t size) { + assert(size > 0); + int requestedBucket = GET_ALLOC_BUCKET(size); + _lock.acquire(); + + int bucket = GET_AVAILABLE_BUCKET(requestedBucket, _buckets); +// if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { +// printf("MemoryManager[%d] requested size: %lu, requested bucket: %d, available bucket: %d\n", _deviceID, size, requestedBucket, bucket); +// } + + assert(bucket >= 
requestedBucket); // Out of memory + + MemorySegment* sourceSegment = _freeSegments[bucket].back(); + MemorySegment* ret = sourceSegment; + if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { // We got a much bigger chunk than we wanted + ret = sourceSegment->subdivide(GET_BUCKET_SIZE(requestedBucket)); + int newSrcBucket = GET_DEALLOC_BUCKET(sourceSegment->getSize()); + if (newSrcBucket != bucket) { + _freeSegments[bucket].pop_back(); + _freeSegments[newSrcBucket].push_back(sourceSegment); + SET_BIT(_buckets, newSrcBucket); + } + } else { + _freeSegments[bucket].pop_back(); + } + if (_freeSegments[bucket].size() == 0) { + CLEAR_BIT(_buckets, bucket); + } + _lock.release(); + return ret; + } + + void free(MemorySegment* mem) { + assert(mem != NULL); + assert(mem->getSize() >= FIRST_BUCKET_SIZE); + int bucket = GET_DEALLOC_BUCKET(mem->getSize()); + // Synchronize for safety, so that we don't free memory that's being used. Not synchronizing + // could potentially cause a problem if we re-allocate the just-freed chunk and attempt to + // use it in a different stream. + if (SYNC_ON_FREE) { + int d; + checkCudaErrors(cudaGetDevice(&d)); + checkCudaErrors(cudaSetDevice(mem->getDeviceID())); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaSetDevice(d)); + } + _lock.acquire(); + _freeSegments[bucket].push_back(mem); + SET_BIT(_buckets, bucket); +// printf("MemoryManager[%d] Freed segment of size %lu into bucket %lu\n", _deviceID, mem->getSize(), bucket); + _lock.release(); + } +}; + +class FastHostMemoryManager : public FastMemoryManager { +protected: + static MemoryManager* _memoryManager; + void allocateInitialSegment() { + _size = HOST_ALLOC_CHUNK; + checkCudaErrors(cudaHostAlloc(&_data, _size, cudaHostAllocPortable)); + } + void freeInitialSegment () { + checkCudaErrors(cudaFreeHost(_data)); + } +public: + FastHostMemoryManager() : FastMemoryManager(DEVICE_HOST) { + } + + static MemoryManager& getInstance(); + static void destroyInstance(); +}; + +class CUDAMemoryManager : public MemoryManager { +protected: + static MemoryManager* _memoryManager; + + virtual void _malloc(DataType** data, size_t size) { + checkCudaErrors(cudaMalloc(data, size)); + } + virtual void _free(MemorySegment* mem) { + checkCudaErrors(cudaFree(mem->getData())); + } +public: + static MemoryManager& getInstance(int deviceID); + static void destroyInstance(int deviceID); + CUDAMemoryManager() { + } + + MemoryManager* init() { + return this; + } + + MemorySegment* malloc(size_t size) { + MemorySegment* seg = new MemorySegment(reinterpret_cast(NULL)); + DataType** data = seg->getDataPtr(); + _malloc(data, size); + return seg; + } + + void free(MemorySegment* mem) { + assert(mem != NULL); + _free(mem); + delete mem; + } +}; + +class CUDAHostMemoryManager : public CUDAMemoryManager { +protected: + static MemoryManager* _memoryManager; + void _free(MemorySegment* mem) { + checkCudaErrors(cudaFreeHost(mem->getData())); + } + void _malloc(DataType** data, size_t size) { + checkCudaErrors(cudaHostAlloc(data, size, cudaHostAllocPortable)); + } +public: + static MemoryManager& getInstance(); + static void destroyInstance(); + CUDAHostMemoryManager() : CUDAMemoryManager() { + + } +}; +#endif /* MEMORY_CUH_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh new file mode 100644 index 0000000..d878d74 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh @@ -0,0 +1,667 @@ +/* + * Copyright 2014 
Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_H_ +#define NVMATRIX_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../../util/include/matrix.h" +#include "nvmatrix_kernels.cuh" +#include "nvmatrix_operators.cuh" +#include "memory.cuh" + +#ifdef WARNINGS +#define WARN(msg) printf("WARN: File %s, line %d: %s\n", __FILE__, __LINE__, msg); +#else +#define WARN(msg) ; +#endif + +#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \ + printf("CURAND Error at %s:%d\n",__FILE__,__LINE__);\ + exit(EXIT_FAILURE);}} while(0) + +#define CUBLAS_CALL(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \ + printf("CUBLAS Error at %s:%d\n",__FILE__,__LINE__);\ + exit(EXIT_FAILURE);}} while(0) + +/* + * Memory manager to use for GPU memory allocations. + * + * CUDAMemoryManager: Default Nvidia memory manager; just calls cudaMalloc / cudaFree. + * Allocating and freeing memory is slow. + * FastMemoryManager: A GPU memory manager with very fast (constant time) + * alloc / free, but possibly more wasteful of memory. + */ +#define DEVICE_MEMORY_MANAGER CUDAMemoryManager + +/* + * Memory manager to use for host memory allocations. + * + * CUDAHostMemoryManager: Default Nvidia memory manager; just calls cudaHostAlloc / cudaFreeHost. + * Allocating and freeing memory is slow. + * FastHostMemoryManager: A host memory manager with very fast (constant time) + * alloc / free, but possibly more wasteful of memory. + */ +#define HOST_MEMORY_MANAGER CUDAHostMemoryManager + +class NVMatrix; +typedef std::vector NVMatrixV; + +class NVMatrix { +protected: + int _numCols, _numRows; + int _numElements; + int _stride; +// float* getDevData(); + MemorySegment* _memSegment; + bool _isTrans; + bool _ownsData; + // This flag makes sure that the NVMatrix destructor does nothing + // when called on HostNVMatrix instance. + bool _deleted; + cudaTextureObject_t _texObj; + +// static std::map rndGen; + static std::map _rndDevStates; + static std::map _cublasHandles; + // Map from device id --> # of random streams initialized on that device + static std::map _rndDevThreads; + static pthread_mutex_t *_rndMutex, *_cublasMutex, *_streamMutex; + // Map from device id --> default stream + static std::map _defaultStreams; + + cublasOperation_t getTransChar() const { + /* + * not a typo! return opposite character because a + * non-transposed nvmatrix is in row-major order while a non-transposed + * cublas matrix is in column-major order. + */ + return _isTrans ? 
CUBLAS_OP_N : CUBLAS_OP_T; + } + + void _init(bool isTrans); + void _sum_setParams(int n, dim3* blocks, dim3* threads); + template float cpuAgg(Agg agg, cudaStream_t stream); + template float _totalAgg(Agg agg); + template float _totalAgg(Agg agg, cudaStream_t stream); + template float _totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop); + + template void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp); + template NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, NVMatrix& tmp); + + template void _unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream); + template void _unaryRandomize(NVMatrix& target, Randomizer rnd); + template void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd); + template void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream); + + virtual void alloc(int numElements); + virtual void dealloc(); + void deallocTexture(); + virtual NVMatrix& construct() const; + virtual NVMatrix& construct(bool isTrans) const; + virtual NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const; + virtual NVMatrix& construct(const Matrix& like, bool copy) const; + virtual NVMatrix& construct(const NVMatrix& like, bool copy) const; + virtual NVMatrix& construct(const NVMatrix& like) const; + virtual NVMatrix& construct(const Matrix& like) const; + virtual NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const; + static cublasHandle_t getCublasHandle(); + static cublasHandle_t getCublasHandle(int deviceID); +public: + NVMatrix(); + NVMatrix(bool isTrans); + NVMatrix(int numRows, int numCols, bool isTrans=false); + NVMatrix(const Matrix& like, bool copy); + NVMatrix(const NVMatrix& like, bool copy); + NVMatrix(const NVMatrix& like); + NVMatrix(const Matrix& like); + NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans); + virtual ~NVMatrix(); + + // Returns the device ID on which the data pointer is allocated + int getDataDeviceID() const; + static void initRandom(unsigned long long seed, int numStreams, cudaStream_t stream); + static void initRandom(unsigned long 
long seed, int numStreams); + static void initRandom(unsigned long long seed); + static void initRandom(); + static void initCublas(); + static void destroyCublas(); + static std::pair getCudaMemorySize(); + + // Returns the currently-active device ID for calling thread + static int getDeviceID(); + static void setDeviceID(int d); + static bool canAccessPeer(int srcDevice, int tgtDevice); + static bool isRndInitialized(); + static bool isRndInitialized(bool haveLock); + static curandState* getCurandState(); + static curandState* getCurandState(int numStreams); + static void destroyRandom(); + static pthread_mutex_t* makeMutex(); + static cudaStream_t getDefaultStream(int deviceID); + static cudaStream_t getDefaultStream(); + static void syncDevice(); + static void syncStream(); + static void syncStream(cudaStream_t stream); + + /* + * DO NOT DEREFERENCE IN HOST CODE! This is a device memory pointer. + */ + float* getCellPtr(int i, int j) const { + if (_isTrans) { + return &getDevData()[j * _numRows + i]; + } + return &getDevData()[i * _numCols + j]; + } + + bool isSameDims(const Matrix& m) const { + return m.getNumRows() == _numRows && m.getNumCols() == _numCols; + } + + bool isSameDims(const NVMatrix& m) const { + return m.getNumRows() == _numRows && m.getNumCols() == _numCols; + } + + int getNumRows() const { + return _numRows; + } + + int getNumCols() const { + return _numCols; + } + + int getStride() const { + return _stride; + } + + int getLeadingDim() const { + return _isTrans ? _numRows : _numCols; + } + + int getFollowingDim() const { + return !_isTrans ? _numRows : _numCols; + } + + /* + * FALSE: Row-major order. + * TRUE: Column-major order. + */ + bool isTrans() const { + return _isTrans; + } + + bool isView() const { + return !_ownsData; + } + + float* getDevData() const { + return _memSegment == NULL ? NULL : _memSegment->getData(); + } + + MemorySegment& getMemorySegment() const { + return *_memSegment; + } + + int getNumElements() const { + return _numElements; + } + + size_t getNumDataBytes() const { + return size_t(_numElements) * 4; + } + + /* + * Only use if you know what you're doing! + * Does not actually transpose matrix. + */ + void setTrans(bool trans) { + if (trans != _isTrans) { + assert(isContiguous()); + _isTrans = trans; + _stride = getLeadingDim(); + } + } + + /* + * Only use if you know what you're doing! + * This toggles whether this object will free its GPU memory when it's destroyed. 
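+ *
+ * (Editorial note, illustrative) When several NVMatrix objects alias the same
+ * MemorySegment, every aliasing instance except one should be marked with
+ * setIsView(true), so that exactly one destructor returns the segment to the
+ * memory manager and the others leave it untouched.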
+ */ + void setIsView(bool isView) { + _ownsData = !isView; + } + + bool isContiguous() const { + return _stride == getLeadingDim() || getFollowingDim() == 1; + } + + void truncate() { + resize(0,0); + } + + virtual cudaTextureObject_t getTextureObject(); + + virtual void copyFromHost(const Matrix& hostMatrix); + virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget); + virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream); + virtual void copyToHost(Matrix& hostMatrix) const; + virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget) const; + virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const; + void copy(NVMatrix& dest) const; + void copy(NVMatrix& dest, cudaStream_t stream) const; + NVMatrix& copy() const; + void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream); + void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB); + void addProduct(NVMatrix& a, NVMatrix &b); + void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream); + void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target); + void rightMult(NVMatrix &b, NVMatrix &target); + void rightMult(NVMatrix &b, float scaleAB); + void randomizeUniform(); + void addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target); + void addGaussianNoise(float stdev, NVMatrix& target); + void addGaussianNoise(NVMatrix& stdevs, bool var); + void addGaussianNoise(NVMatrix& stdevs); + void addGaussianNoise(float stdev); + void addGaussianNoise(); + void randomizeGaussian(); + void randomizeGaussian(float stdev); + void randomizeGaussian(float mean, float stdev); + void randomizeGaussian(float mean, NVMatrix& stdevs); + void randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs); + void randomizeGaussian(NVMatrix& stdevs); + void randomizeGaussian(NVMatrix& stdevs, NVMatrix& target); + void binarizeProbs(); + void binarizeProbs(NVMatrix& target); + + void biggerThan(NVMatrix& m, NVMatrix& target); + void biggerThan(NVMatrix& m); + void biggerThanVector(NVMatrix& vec, NVMatrix& target); + void biggerThanVector(NVMatrix& vec); + void equals(NVMatrix& m, NVMatrix& target); + void equals(NVMatrix& m); + + void _checkBounds(int startRow, int endRow, int startCol, int endCol) const; + NVMatrix& slice(int startRow, int endRow, int startCol, int endCol) const; + void slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const; + NVMatrix& sliceRows(int startRow, int endRow) const; + void sliceRows(int startRow, int endRow, NVMatrix& target) const; + NVMatrix& sliceCols(int startCol, int endCol) const; + void sliceCols(int startCol, int endCol, NVMatrix& target) const; + + NVMatrixV& splitRows(int numParts); + NVMatrixV& splitCols(int numParts); + + template void apply(Op op, NVMatrix& target, cudaStream_t stream) { + if (!target.isSameDims(*this)) { + target.resize(*this); + } + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + + if (target.isTrans() == isTrans()) { + if (!isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)), + std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseUnaryOp<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + getLastCudaError("kEltwiseUnaryOp: Kernel execution failed"); + } else { + dim3 
threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseUnaryOpFlat<<>>(getDevData(), target.getDevData(), _numElements, op); + getLastCudaError("kEltwiseUnaryOpFlat: Kernel execution failed"); + } + } else { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)), + std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0); + // printf("height: %d, width: %d, stride: %d, target stride: %d, check bounds: %d, threads.x: %d, threads.y: %d, blocks.x: %d, blocks.y: %d\n", + // height, width, getStride(), target.getStride(), checkBounds, threads.x, threads.y, blocks.x, blocks.y); + if (checkBounds) { + kEltwiseUnaryOpTrans<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + } else { + kEltwiseUnaryOpTrans<<>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op); + } + getLastCudaError("kEltwiseUnaryOpTrans: Kernel execution failed"); + } + } + } + + template void apply(Op op, cudaStream_t stream) { + apply(op, *this, stream); + } + + template void apply(Op op, NVMatrix& target) { + apply(op, target, getDefaultStream()); + } + + template void apply(Op op) { + apply(op, *this); + } + + template void applyBinary(Op op, NVMatrix& b) { + applyBinary(op, b, *this); + } + + template void applyBinary(Op op, NVMatrix& b, NVMatrix& target) { + applyBinary(op, b, target, getDefaultStream()); + } + + template void applyBinary(Op op, NVMatrix& b, NVMatrix& target, cudaStream_t stream) { + assert(this->isSameDims(b)); + + if (!target.isSameDims(*this)) { + target.resize(*this); + } + + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + if (target.isTrans() == isTrans() && target.isTrans() == b.isTrans()) { + if (!isContiguous() || !b.isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)), + std::min(128, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseBinaryOp<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), + b.getStride(), target.getStride(), op); + } else { + dim3 threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseBinaryOpFlat<<>>(getDevData(), b.getDevData(), target.getDevData(), _numElements, op); + } + getLastCudaError("kEltwiseBinaryOp: Kernel execution failed"); + } else { + + dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)), + std::min(128, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + // both x here since y divides x + bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0); + if (target.isTrans() == isTrans() && target.isTrans() != b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } + } else if (target.isTrans() != isTrans() && target.isTrans() != b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), 
target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(), + b.getStride(), target.getStride(), op); + } + } else if (target.isTrans() != isTrans() && target.isTrans() == b.isTrans()) { + if (checkBounds) { + kEltwiseBinaryOpTrans<<>>(b.getDevData(), getDevData(), target.getDevData(), height, width,b.getStride(), + getStride(), target.getStride(), op); + } else { + kEltwiseBinaryOpTrans<<>>(b.getDevData(), getDevData(), target.getDevData(), height, width, b.getStride(), + getStride(), target.getStride(), op); + } + } + getLastCudaError("kEltwiseBinaryOpTrans: Kernel execution failed"); + } + } + } + + template void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target) { + applyTernary(op, b, c, target, getDefaultStream()); + } + + template void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target, cudaStream_t stream) { + assert(isSameDims(b)); + assert(isSameDims(c)); + // For now ternary ops are only supported for matrices of same transposedness + assert(isTrans() == b.isTrans()); + assert(isTrans() == c.isTrans()); + if (!target.isSameDims(*this) || target.isTrans() != isTrans()) { + target.resize(*this); + } + if (getNumElements() > 0) { + int height = target.getFollowingDim(), width = target.getLeadingDim(); + if (!isContiguous() || !b.isContiguous() || !c.isContiguous() || !target.isContiguous()) { + dim3 blocks(std::min(512, DIVUP(width, ELTWISE_THREADS_X)), + std::min(512, DIVUP(height, ELTWISE_THREADS_Y))); + dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y); + kEltwiseTernaryOp<<>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), height, width, + getStride(), b.getStride(), c.getStride(), target.getStride(), op); + getLastCudaError("kEltwiseTernaryOp: Kernel execution failed"); + } else { + dim3 threads = dim3(ELTWISE_FLAT_THREADS_X); + dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X))); + kEltwiseTernaryOpFlat<<>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), _numElements, op); + getLastCudaError("kEltwiseTernaryOpFlat: Kernel execution failed"); + } + } + } + + bool resize(int numRows, int numCols, bool trans); + bool resize(int numRows, int numCols); + bool resize(const NVMatrix &like); + bool resize(const Matrix &like); + void reshape(int numRows, int numCols); + NVMatrix& reshaped(int numRows, int numCols) const; + void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol) const; + void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol, cudaStream_t stream) const; + void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream); + void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target); + void add(NVMatrix& b, float scaleB, NVMatrix& target); + void add(NVMatrix& b, NVMatrix& target); + void add(NVMatrix& b, float scaleB); + void add(NVMatrix& b, float scaleA, float scaleB); + void add(NVMatrix& b); + void eltwiseMult(NVMatrix& b); + void eltwiseMult(NVMatrix& b, NVMatrix& target); + void eltwiseDivide(NVMatrix& b); + void eltwiseDivide(NVMatrix& b, NVMatrix& target); + void squaredDiff(NVMatrix& b); + void squaredDiff(NVMatrix& b, NVMatrix& target); + void subtract(NVMatrix& b, NVMatrix& target); + void subtract(NVMatrix& b); + void addVector(NVMatrix& vec, float 
scaleVec, NVMatrix& target, cudaStream_t stream); + void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target); + void addVector(NVMatrix& vec); + void addVector(NVMatrix& vec, float scaleVec); + void addVector(NVMatrix& vec, NVMatrix& target); + void equalsVector(NVMatrix& vec, NVMatrix& target); + void equalsVector(NVMatrix& vec); + void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream); + void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target); + void eltwiseMultByVector(NVMatrix& vec); + void eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream); + void eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target); + void eltwiseDivideByVector(NVMatrix& vec); + void tile(int timesY, int timesX, NVMatrix& target); + void tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream); + + void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum); + void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream); + void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax); + void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream); + void sum(int axis, NVMatrix& target, cudaStream_t stream); + void sum(int axis, NVMatrix& target); + void sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp); + void sum(int axis, NVMatrix& target, NVMatrix& tmp); + NVMatrix& sum(int axis); + void max(int axis, NVMatrix& target); + void max(int axis, NVMatrix& target, NVMatrix& tmp); + NVMatrix& max(int axis); + void min(int axis, NVMatrix& target); + NVMatrix& min(int axis); + void sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream); + void sumOfSquares(int axis, NVMatrix& target); + NVMatrix& sumOfSquares(int axis); + float mean(); + float sum(); + float sum(NVMatrix& tmpbuf); + float max(); + float min(); + float countInf(); + float countNan(); + float norm2(); + float norm(); + + void inRangeInc(float lower, float upper); + void inRangeInc(float lower, float upper, NVMatrix& target); + void inRangeExc(float lower, float upper); + void inRangeExc(float lower, float upper, NVMatrix& target); + void biggerThanScalar(float scalar); + void biggerThanScalar(float scalar, NVMatrix& target); + void smallerThanScalar(float scalar); + void smallerThanScalar(float scalar, NVMatrix& target); + void addScalar(float scaleThis, float scalar, NVMatrix& target); + void addScalar(float scalar, NVMatrix& target); + void addScalar(float scalar); + void minWithScalar(float scalar, NVMatrix& target); + void minWithScalar(float scalar); + void maxWithScalar(float scalar, NVMatrix& target); + void maxWithScalar(float scalar); + void pow(float p, NVMatrix& target); + void pow(float p); + void scale(float _scale); + void scale(float _scale, NVMatrix& target); + void scale(float _scale, NVMatrix& target, cudaStream_t stream); + void scale(float _scale, cudaStream_t stream); + void zero(); + void zero(NVMatrix& like); + + float dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream); + float dotProduct(NVMatrix& b, cudaStream_t stream); + float dotProduct(NVMatrix& b); + + /* + * Does SOFT transpose and returns result, leaving this matrix unchanged + */ + NVMatrix& getTranspose(); + NVMatrix& getClone(); + + /* + * Does HARD transpose and puts result in target + */ + void transpose(NVMatrix& target); + + /* + * Does SOFT transpose + */ + void transpose(); + bool transpose(bool trans); + + void flipTrans(NVMatrix& target, cudaStream_t stream); + void flipTrans(NVMatrix& target); + NVMatrix& 
flipTrans(); + + void print(int startRow, int rows, int startCol, int cols) const; + void print(int rows, int cols) const; + void printShape(const char* name) const; + + template void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target) { + applyBinaryV(op, vec, target, getDefaultStream()); + } + + template void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target, cudaStream_t stream) { + assert(&target != &vec); // for now + if (isSameDims(vec)) { + applyBinary(op, vec, target, stream); + return; + } + assert(vec.getNumRows() == 1 || vec.getNumCols() == 1); + assert(vec.getNumRows() == _numRows || vec.getNumCols() == _numCols); + assert(vec.isContiguous()); + + target.resize(*this); // target must be same orientation as me for now + int width = getLeadingDim(); //_isTrans ? _numRows : _numCols; + int height = getFollowingDim(); //_isTrans ? _numCols : _numRows; + dim3 threads(ADD_VEC_THREADS_X, ADD_VEC_THREADS_Y); + + if ((vec.getNumRows() == _numRows && !isTrans()) || (vec.getNumCols() == _numCols && isTrans())) { + dim3 blocks(std::min(512, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y))); + kColVectorOp<<>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op); + } else { + dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y))); + kRowVectorOp<<>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op); + } + getLastCudaError("Kernel execution failed"); + // cudaThreadSynchronize(); + } + + template float argMax(UnaryOperator u) { + return _totalAgg(NVMatrixAggs::ArgMax(u)); + } + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev); + static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB); + + static void assertSame(NVMatrixV& a); +}; + +class HostNVMatrix : public NVMatrix { +protected: + void alloc(int numElements); + void dealloc(); + NVMatrix& construct() const; + NVMatrix& construct(bool isTrans) const; + NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const; + NVMatrix& construct(const Matrix& like, bool copy) const; + NVMatrix& construct(const NVMatrix& like, bool copy) const; + NVMatrix& construct(const NVMatrix& like) const; + NVMatrix& construct(const Matrix& like) const; + NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const; +public: + ~HostNVMatrix(); + HostNVMatrix(); + HostNVMatrix(bool isTrans); + HostNVMatrix(int numRows, int numCols, bool isTrans=false); + HostNVMatrix(const Matrix& like, bool copy); + HostNVMatrix(const NVMatrix& like, bool copy); + HostNVMatrix(const NVMatrix& like); + HostNVMatrix(const Matrix& like); + HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans); + void copyFromHost(const Matrix& hostMatrix); + void copyFromHost(const Matrix& hostMatrix, bool resizeTarget); + void copyFromHost(const Matrix& hostMatrix, 
bool resizeTarget, cudaStream_t stream); + void copyToHost(Matrix& hostMatrix) const; + void copyToHost(Matrix& hostMatrix, bool resizeTarget) const; + void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const; + cudaTextureObject_t getTextureObject(); +}; + +#endif /* NVMATRIX_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh new file mode 100644 index 0000000..99b234a --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_kernels.cuh @@ -0,0 +1,727 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_KERNEL_H_ +#define NVMATRIX_KERNEL_H_ + +#include + +#if defined(_WIN64) || defined(_WIN32) +#define uint unsigned int +#endif + +#define NUM_BLOCKS_MAX 65535 +#define TEXTURE_SIZE_MAX (1<<29) + +#define NUM_RND_BLOCKS 96 +#define NUM_RND_THREADS_PER_BLOCK 128 +#define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK) + +/* + * Default grid/block sizes for the various functions. + */ +#define ADD_BLOCK_SIZE 16 + +#define NUM_TILE_BLOCKS 4096 +#define NUM_TILE_THREADS_PER_BLOCK 512 + +#define ELTWISE_THREADS_X 32 +#define ELTWISE_THREADS_Y 8 + +#define ELTWISE_FLAT_THREADS_X 128 + +#define NUM_SUM_COLS_THREADS_PER_BLOCK 128 + +#define AGG_SHORT_ROWS_THREADS_X 32 +#define AGG_SHORT_ROWS_THREADS_Y 8 +#define AGG_SHORT_ROWS_LOOPS_Y 32 + +#define DP_BLOCKSIZE 512 +#define CPUSUM_MAX 4096 + +#define ADD_VEC_THREADS_X 64 +#define ADD_VEC_THREADS_Y 4 + +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#endif + +#define MYMAX(a, b) ((a) > (b) ? (a) : (b)) + +#ifndef MUL24 // legacy +#define MUL24(x,y) ((x) * (y)) +#endif + +#define AWR_NUM_THREADS 256 +#define WARP_SIZE 32 +#define AWR_NUM_WARPS AWR_NUM_THREADS / WARP_SIZE +#define AWR_LOG_NUM_THREADS 8 +#define LOG_WARP_SIZE 5 +#define AWR_LOG_NUM_WARPS 3 + +#define DEVICE_HOST -1 +#define DEVICE_NULL -2 + +__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight); +__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements); +__global__ void kSetupCurand(curandState *state, unsigned long long seed); + +template +__device__ T shfl_down(T a, int b, int c=WARP_SIZE) { +#if __CUDA_ARCH__ >= 300 + return __shfl_down(a, b, c); +#else + return 0; +#endif +} + +/* + * For now this is supported only for arrays with the same transposedness. 
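+ *
+ * Illustrative usage (editorial addition; MultAdd is a hypothetical functor,
+ * not part of the original source):
+ *
+ *   struct MultAdd {
+ *       __device__ inline float operator()(float a, float b, float c) const {
+ *           return a * b + c;   // dest = a * b + c, elementwise
+ *       }
+ *   };
+ *
+ * NVMatrix::applyTernary(MultAdd(), b, c, target) dispatches to this kernel
+ * when any operand is non-contiguous, and to kEltwiseTernaryOpFlat otherwise.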
+ */ +template +__global__ void kEltwiseTernaryOp(const float* a, const float* b, const float* c, float* const dest, + const uint height, const uint width, uint strideA, const uint strideB, const uint strideC, + const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x], c[y * strideC + x]); + } + } +} + +template +__global__ void kEltwiseTernaryOpFlat(const float* a, const float* b, const float* c, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x], b[x], c[x]); + } +} + + +/* + * dest here is assumed to be "not transposed" -- height and width correspond to it. + * b is assumed to be transposed. + * a can be either transposed or not -- depending on parameter. + * + * Performs dest := op(a, b) + */ +template +__global__ void kEltwiseBinaryOpTrans(const float* a, const float* b, float* const dest, + const uint height, const uint width, + const uint strideA, const uint strideB, const uint strideDest, Op op) { + + __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1]; + + // x here because that's how much work we do + for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) { + for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) { + const uint readX = by + threadIdx.x; + const uint readY = bx + threadIdx.y; + + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if (!checkBounds || (readX < height && readY + y < width)) { + if (aTrans) { + shmem[threadIdx.x][threadIdx.y + y] = reverse ? op(b[(readY+y) * strideB + readX], a[(readY+y) * strideA + readX]) + : op(a[(readY+y) * strideA + readX], b[(readY+y) * strideB + readX]); + } else { + shmem[threadIdx.x][threadIdx.y + y] = b[(readY+y) * strideB + readX]; + } + } + } + __syncthreads(); + + const uint writeX = bx + threadIdx.x; + const uint writeY = by + threadIdx.y; + + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if(!checkBounds || (writeX < width && writeY + y < height)) { + if (aTrans) { + dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x]; + } else { + dest[(writeY + y) * strideDest + writeX] = reverse ? 
op(shmem[threadIdx.y + y][threadIdx.x], a[(writeY + y) * strideA + writeX]) + : op(a[(writeY + y) * strideA + writeX], shmem[threadIdx.y + y][threadIdx.x]); + } + } + } + __syncthreads(); + } + } +} +template +__global__ void kEltwiseBinaryOp(const float* a, const float* b, float* const dest, const uint height, const uint width, + const uint strideA, const uint strideB, const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x]); + } + } +} + +template +__global__ void kEltwiseBinaryOpFlat(const float* a, const float* b, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x], b[x]); + } +} + +/* + * dest here is assumed to be "not transposed" -- height and width correspond to it. + */ +template +__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest, + const uint height, const uint width, + const uint strideA, const uint strideDest, Op op) { + + __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1]; + + for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) { + for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) { + const uint readX = by + threadIdx.x; + const uint readY = bx + threadIdx.y; + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if (!checkBounds || (readX < height && readY + y < width)) { + shmem[threadIdx.x][threadIdx.y + y] = op(a[(readY + y) * strideA + readX]); + } + } + __syncthreads(); + + const uint writeX = bx + threadIdx.x; + const uint writeY = by + threadIdx.y; + for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) { + if(!checkBounds || (writeX < width && writeY + y < height)) { + dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x]; + + } + } + __syncthreads(); + } + } +} + +template +__global__ void kEltwiseUnaryOpFlat(const float* a, float* const dest, const uint numElements, Op op) { + const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; + + for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { + dest[x] = op(a[x]); + } +} + +template +__global__ void kEltwiseUnaryOp(const float* a, float* const dest, const uint height, const uint width, + const uint strideA, const uint strideDest, Op op) { + const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; + const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; + + for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { + for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { + dest[y * strideDest + x] = op(a[y * strideA + x]); + } + } +} + +/* + * Matrix in ROW-MAJOR order! 
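+ *
+ * (Editorial note) Computes tgt[y][x] = op(mat[y][x], vec[x]): the length-width
+ * vector is broadcast across every row of the row-major matrix, with one tile
+ * of the vector staged in shared memory per iteration of the outer loop.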
+ */ +template +__global__ void kRowVectorOp(const float* mat, const float* vec, float* const tgtMat, const uint width, const uint height, + const uint matStride, const uint tgtStride, Op op) { + __shared__ float shVec[ADD_VEC_THREADS_X]; + const uint bx = ADD_VEC_THREADS_X * blockIdx.x; + const uint by = ADD_VEC_THREADS_Y * blockIdx.y; + + for (uint x = bx; x < width; x += gridDim.x * ADD_VEC_THREADS_X) { + __syncthreads(); + if (x + threadIdx.x < width && threadIdx.y == 0) { + shVec[threadIdx.x] = vec[x + threadIdx.x]; + } + __syncthreads(); + + if (x + threadIdx.x < width) { + for (uint y = by + threadIdx.y; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) { + tgtMat[y * tgtStride + x + threadIdx.x] = op(mat[y * matStride + x + threadIdx.x], shVec[threadIdx.x]); + } + } + } +} + +/* + * Matrix in ROW-MAJOR order! + */ +template +__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat, + const uint width, const uint height, + const uint matStride, const uint tgtStride, Op op) { + __shared__ float shVec[ADD_VEC_THREADS_Y]; + const uint by = ADD_VEC_THREADS_Y * blockIdx.y; + const uint bx = ADD_VEC_THREADS_X * blockIdx.x; + const uint tidx = ADD_VEC_THREADS_X * threadIdx.y + threadIdx.x; + + mat += threadIdx.y * matStride; + vec += tidx; + tgtMat += threadIdx.y * tgtStride; + + for (uint y = by; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) { + __syncthreads(); + if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) { + shVec[tidx] = vec[y]; + } + __syncthreads(); + + if (y + threadIdx.y < height) { + for (uint x = bx + threadIdx.x; x < width; x += gridDim.x * ADD_VEC_THREADS_X) { + tgtMat[(y) * tgtStride + x] = op(mat[(y) * matStride + x], shVec[threadIdx.y]); + } + } + } +} + +/* + * This one gets coalesced reads but computes only a partial sum which + * must either be summed again (recursively) or summed on the host. + */ +template +__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) { + const int idxX = blockIdx.x * blockSize*2 + threadIdx.x; + + __shared__ float accum[blockSize*2]; + + matSum += blockIdx.y * sumWidth + blockIdx.x; + /* + * Here it's important to make sure that all threads in a block call __syncthreads, + * so I have even the redundant threads (for which idxX >= width) enter this loop + * just so that they may call __syncthreads at the appropriate times. 
+ */ + mat += width * blockIdx.y + idxX; + + accum[threadIdx.x] = agg.getBaseValue(); + accum[threadIdx.x + blockSize] = agg.getBaseValue(); + for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) { + if (idxX < width) { + accum[threadIdx.x] = uop(mat[0]); + if(idxX + blockSize < width) + accum[threadIdx.x + blockSize] = uop(mat[blockSize]); + } + if (blockSize >= 512) { + __syncthreads(); + if (threadIdx.x < 512) + accum[threadIdx.x] = agg(accum[threadIdx.x], accum[threadIdx.x + 512]); + } + if (blockSize >= 256) { + __syncthreads(); + if (threadIdx.x < 256) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 256]); + } + if (blockSize >= 128) { + __syncthreads(); + if (threadIdx.x < 128) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 128]); + } + if (blockSize >= 64) { + __syncthreads(); + if (threadIdx.x < 64) + accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 64]); + } + + __syncthreads(); + volatile float* myAccum = &accum[threadIdx.x]; + if (threadIdx.x < 32) { // executed only by first warp + myAccum[0] = agg(myAccum[0], myAccum[32]); + myAccum[0] = agg(myAccum[0], myAccum[16]); + myAccum[0] = agg(myAccum[0], myAccum[8]); + myAccum[0] = agg(myAccum[0], myAccum[4]); + myAccum[0] = agg(myAccum[0], myAccum[2]); + myAccum[0] = agg(myAccum[0], myAccum[1]); + } + + if (threadIdx.x == 0) { + matSum[0] = bop(matSum[0], myAccum[0]); + matSum += gridDim.y * sumWidth; + } + __syncthreads(); + mat += width * gridDim.y; + } +} + +template +__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) { + const int tidx = threadIdx.x; + + __shared__ float accum[AWR_NUM_THREADS]; + volatile float* vMyAccum = &accum[tidx]; + float* myAccum = &accum[tidx]; + + matSum += blockIdx.y; + mat += width * blockIdx.y; + + for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) { + myAccum[0] = agg.getBaseValue(); + for (uint x = tidx; x < width; x += AWR_NUM_THREADS) { + myAccum[0] = agg(myAccum[0], mat[x]); + } + #pragma unroll + for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) { + const uint d = 1 << i; + __syncthreads(); + if (tidx < d) { + myAccum[0] = agg(myAccum[0], myAccum[d]); + } + } + __syncthreads(); + if (tidx < WARP_SIZE) { + #pragma unroll + for (int i = LOG_WARP_SIZE; i >= 0; i--) { + const uint d = 1 << i; + vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]); + } + + if (tidx == 0) { + matSum[0] = op(matSum[0], vMyAccum[0]); + matSum += gridDim.y; + } + } + __syncthreads(); + mat += width * gridDim.y; + } +} + +/* + * Implements multiscan idea from http://www.moderngpu.com + * Not really useful for pure reductions but neat nonetheless. 
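+ *
+ * (Editorial note) Each warp reduces its running value with LOG_WARP_SIZE = 5
+ * shfl_down steps (offsets 1, 2, 4, 8, 16), which needs no synchronization
+ * inside the warp; a single __syncthreads() then publishes the per-warp
+ * results, and the first AWR_NUM_WARPS threads combine them with the same
+ * shuffle pattern before thread 0 folds the row total into matSum.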
+ */ +template +__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height, + Agg agg, UnaryOp uop, BinaryOp bop) { + const uint tidx = threadIdx.x; + const uint warpIdx = tidx / WARP_SIZE; + const uint lane = tidx % WARP_SIZE; + + __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS]; + __shared__ float finalAccum[AWR_NUM_WARPS]; + + float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane]; + float* myFinalAccum = &finalAccum[tidx]; + //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane]; + matSum += blockIdx.y; + mat += width * blockIdx.y; + + float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem + #pragma unroll 32 + for (uint x = tidx; x < width; x += AWR_NUM_THREADS) { + rAccum = agg(rAccum, uop(mat[x])); + } + myAccum[0] = rAccum; + + // Each warp does a reduction that doesn't require synchronizatoin + #pragma unroll + for (uint i = 0; i < LOG_WARP_SIZE; i++) { + const uint d = 1 << i; + myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d)); + } + __syncthreads(); + // The warps write their results + if (tidx < AWR_NUM_WARPS) { + //volatile float* vMyFinalAccum = &finalAccum[tidx]; + myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)]; + #pragma unroll + for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) { + const uint d = 1 << i; + myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d)); + } + if (tidx == 0) { + matSum[0] = bop(matSum[0], myFinalAccum[0]); + matSum += gridDim.y; + } + } +} + +/* + * To be used when the rows are <= 64. + * + * TODO: try to reduce reg usage. i think this can be made faster too. + */ +//#define AGG_SHORT_ROWS_LOOPS_X 4 +template +__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint shmemX = THREADS_X + 1; + __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX]; + + const uint tidx = threadIdx.y * THREADS_X + threadIdx.x; + const uint ty = LOOPS_X == 1 ? tidx / width : threadIdx.y; // when loops==1, width is gonna be smaller than block x dim + const uint tx = LOOPS_X == 1 ? tidx % width : threadIdx.x; + const uint bidx = blockIdx.y * gridDim.x + blockIdx.x; + const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; + float* shmemWrite = shmem + MUL24(ty, shmemX) + tx; + matSum += blockRowIdx + tidx; +// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0; + mat += width * blockRowIdx + MUL24(ty, width) + tx; + float* shmemWriteZeros = &shmem[MUL24(threadIdx.y,shmemX) + threadIdx.x]; + + bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y ; + + if (blockRowIdx < height) { +#pragma unroll + for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) { + doAgg &= tidx + y + blockRowIdx < height; + const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height; + + shmemWriteZeros[0] = agg.getBaseValue(); + __syncthreads(); +#pragma unroll + for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) { +// __syncthreads(); + if (heightIdxOK && x + tx < width) { + shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]); + } + } + __syncthreads(); + if (doAgg) { + /* + * I tried doing this final sum as a 4-step reduction, with 8 threads + * per warp participating. It was slightly slower. 
+ */ + float accum = agg.getBaseValue(); + float* shmemRead = shmem + MUL24(tidx, shmemX); + // this loops too much if the rows are really short :( +#pragma unroll + for (uint i = 0; i < THREADS_X; i++) { + accum = agg(accum, shmemRead[0]); + shmemRead++; + } + matSum[0] = bop(matSum[0], accum); + matSum += AGG_SHORT_ROWS_THREADS_Y; + } + __syncthreads(); + mat += width * AGG_SHORT_ROWS_THREADS_Y; + } + } +} + +template +__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1; + __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX]; + const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X); + const uint tidx = threadIdx.y * AGG_SHORT_ROWS_THREADS_X + threadIdx.x; + + const uint bidx = blockIdx.y * gridDim.x + blockIdx.x; + const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; + + float* shmemWrite = shmem + MUL24(threadIdx.y, shmemX) + threadIdx.x; + matSum += blockRowIdx + tidx; +// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0; + mat += width * blockRowIdx + MUL24(threadIdx.y, width) + threadIdx.x; + + bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y; + if(blockRowIdx < height) { + for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) { + doAgg &= tidx + y + blockRowIdx < height; + const bool heightIdxOK = threadIdx.y + y + blockRowIdx < height; + float accum = agg.getBaseValue(); + shmemWrite[0] = agg.getBaseValue(); + + for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) { +// __syncthreads(); + if (heightIdxOK && x + threadIdx.x < width) { + shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]); + } + } + + __syncthreads(); + if (doAgg) { + float* shmemRead = shmem + MUL24(tidx, shmemX); + +#pragma unroll + for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) { + accum = agg(accum, shmemRead[0]); + shmemRead++; + } + + matSum[0] = bop(matSum[0], accum); + matSum += AGG_SHORT_ROWS_THREADS_Y; + } + __syncthreads(); + mat += width * AGG_SHORT_ROWS_THREADS_Y; + } + } +} + +/* + * Bad when there are few columns. + */ +template +__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) { + const uint idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < width) { + float mx = agg.getBaseValue(); + for (uint j = 0; j < height; j++) { + mx = agg(uop(tex1Dfetch(mat, width * j + idx)), mx); + } + vec[idx] = bop(vec[idx], mx); + } +} + +/* + * Better with few columns because it only computes a partial sum. 
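+ *
+ * (Editorial note) Each blockIdx.y aggregates a sumLength-row slab of column
+ * idxX into vec[blockIdx.y * width + idxX], so the gridDim.y partial results
+ * per column still need one more reduction pass (or a host-side sum) to yield
+ * the final per-column aggregate.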
+ */ +template +__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) { + const uint idxX = blockIdx.x * blockDim.x + threadIdx.x; + const uint idxY = blockIdx.y * sumLength; + if (idxX < width) { + float mx = agg.getBaseValue(); + for (uint j = idxY; j < min(height,idxY + sumLength); j++) { + mx = agg(op(tex1Dfetch(mat, j * width + idxX)), mx); + } + vec[blockIdx.y * width + idxX] = mx; + } +} + +template +__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) { + __shared__ float shmem[DP_BLOCKSIZE]; + uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x; + shmem[threadIdx.x] = agg.getBaseValue(); + if (eidx < gridDim.x * DP_BLOCKSIZE) { + for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], a[eidx]); + } + } + __syncthreads(); + if (threadIdx.x < 256) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 256]); + } + __syncthreads(); + if (threadIdx.x < 128) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 128]); + } + __syncthreads(); + if (threadIdx.x < 64) { + shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 64]); + } + __syncthreads(); + if (threadIdx.x < 32) { + volatile float* mysh = &shmem[threadIdx.x]; + *mysh = agg(*mysh, mysh[32]); + *mysh = agg(*mysh, mysh[16]); + *mysh = agg(*mysh, mysh[8]); + *mysh = agg(*mysh, mysh[4]); + *mysh = agg(*mysh, mysh[2]); + *mysh = agg(*mysh, mysh[1]); + if (threadIdx.x == 0) { + target[blockIdx.x] = *mysh; + } + } +} + +class AddGaussianUnaryRandomizer { +private: + const float stdev; +public: + AddGaussianUnaryRandomizer(float _stdev) : stdev(_stdev) { + } + __device__ inline float operator ()(float data, curandState* state) { + return data + stdev * curand_normal(state); + } +}; + +class BinarizeUnaryRandomizer { +public: + __device__ inline float operator ()(float data, curandState* state) { + return data > curand_uniform(state); + } +}; + +class UniformUnaryRandomizer { +public: + __device__ inline float operator ()(float data, curandState* state) { + return curand_uniform(state); + } +}; + +class GaussianUnaryRandomizer { +private: + const float mean, stdev; +public: + GaussianUnaryRandomizer(float _mean, float _stdev) : mean(_mean), stdev(_stdev) { + } + __device__ inline float operator ()(float data, curandState* state) { + return mean + stdev * curand_normal(state); + } +}; + +template +class AddGaussianBinaryRandomizer { +public: + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return data + (var ? 
stdev : 1) * stdev * curand_normal(state); + } +}; + +class GaussianBinaryRandomizer { +private: + const float mean; +public: + GaussianBinaryRandomizer(float _mean) : mean(_mean) { + } + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return mean + stdev * curand_normal(state); + } +}; + +class ScaledGaussianBinaryRandomizer { +private: + const float mean, stdevScale; +public: + ScaledGaussianBinaryRandomizer(float _mean, float _stdevScale) : mean(_mean), stdevScale(_stdevScale) { + } + __device__ inline float operator ()(float data, float stdev, curandState* state) { + return mean + stdevScale * stdev * curand_normal(state); + } +}; + +template +__global__ void kUnaryRandomize(float* data, float* targets, curandState* state, const uint numElements, Randomizer rnd) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + curandState localState = state[tidx]; + + for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) { + targets[i] = rnd(data[i], &localState); + } + state[tidx] = localState; +} + +template +__global__ void kBinaryRandomize(float* data, float* data2, float* targets, curandState* state, const uint numElements, Randomizer rnd) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + curandState localState = state[tidx]; + + for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) { + targets[i] = rnd(data[i], data2[i], &localState); + } + state[tidx] = localState; +} + +#endif /* NVMATRIX_KERNEL_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh new file mode 100644 index 0000000..6c2a4c3 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh @@ -0,0 +1,485 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVMATRIX_OPERATORS_CUH +#define NVMATRIX_OPERATORS_CUH + +class NVMatrixOps { +public: + class Exp { + public: + __device__ inline float operator()(const float a) const { + return __expf(a); + } + }; + + class Logistic { + public: + __device__ inline float operator()(const float a) const { + return __fdividef(1.0f, 1.0f + __expf(-a)); + } + }; + + class Log { + public: + __device__ inline float operator()(const float a) const { + return __logf(a); + } + }; + + class Square { + public: + __device__ inline float operator()(const float a) const { + return a * a; + } + }; + + class Sqrt { + public: + __device__ inline float operator()(const float a) const { + return sqrtf(a); + } + }; + + class SqrtAbs { + public: + __device__ inline float operator()(const float a) const { + return sqrtf(fabsf(a)); + } + }; + + class Reciprocal { + public: + __device__ inline float operator()(const float a) const { + return 1.0f / a; + } + }; + + class Abs { + public: + __device__ inline float operator()(const float a) const { + return a > 0 ? 
a : -a; + } + }; + + class Sign { + public: + __device__ inline float operator()(const float a) const { + return (a > 0) - (a < 0); + } + }; + + class Identity { + public: + __device__ inline float operator()(const float a) const { + return a; + } + }; + + class Zero { + public: + __device__ inline float operator()(const float a) const { + return 0; + } + }; + + class One { + public: + __device__ inline float operator()(const float a) const { + return 1; + } + }; + + class Const { + private: + const float scalar; + public: + Const(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return scalar; + } + }; + + class OneMinus { + public: + __device__ inline float operator()(const float x) const { + return 1.0f - x; + } + }; + + class Linear { + protected: + float _a, _b; + public: + __device__ inline float operator()(float x) const { + return _a * x + _b; + } + Linear(float a, float b) : _a(a), _b(b) { + } + }; + + class IsNan { + public: + __device__ inline float operator()(const float a) const { + return isnan(a); + } + }; + + class IsInf { + public: + __device__ inline float operator()(const float a) const { + return isinf(a); + } + }; + + class SmallerThanScalar { + private: + const float scalar; + public: + SmallerThanScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a < scalar; + } + }; + + class BiggerThanScalar { + private: + const float scalar; + public: + BiggerThanScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar; + } + }; + + class AddScalar { + private: + const float scalar; + public: + AddScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a + scalar; + } + }; + + class WeightedAddScalar { + private: + const float weight, scalar; + public: + WeightedAddScalar(const float _weight, const float _scalar) : weight(_weight), scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return weight * a + scalar; + } + }; + + class MultByScalar { + private: + const float scalar; + public: + MultByScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a * scalar; + } + }; + + class Pow { + private: + const float p; + public: + Pow(const float _p) : p(_p) { + } + __device__ inline float operator()(const float a) const { + return __powf(a, p); + } + }; + + template + class InRange { + private: + const float lower, upper; + public: + InRange(const float _lower, const float _upper) : lower(_lower), upper(_upper) { + } + __device__ inline float operator()(const float a) const { + return exclusive ? a > lower && a < upper : a >= lower && a <= upper; + } + }; + + class MinWithScalar { + private: + const float scalar; + public: + MinWithScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar ? scalar : a; + } + }; + + class MaxWithScalar { + private: + const float scalar; + public: + MaxWithScalar(const float _scalar) : scalar(_scalar) { + } + __device__ inline float operator()(const float a) const { + return a > scalar ? 
a : scalar; + } + }; +}; + +class NVMatrixBinaryOps { +public: + class BinaryOp { + public: + }; + class Equals : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a == b; + } + }; + + class BiggerThan : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b; + } + }; + + class Divide : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return __fdividef(a, b); + } + }; + + class DivideAccurate : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a / b; + } + }; + + class DivideSafe : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b == 0 ? 0 : __fdividef(a, b); + } + }; + + class DivideSafeAccurate : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b == 0 ? 0 : (a / b); + } + }; + + class Multiply : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a * b; + } + }; + + class SquaredDiff : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return (a - b) * (a - b); + } + }; + + class WeightedAdd : public BinaryOp { + private: + const float scaleA, scaleB; + public: + WeightedAdd(const float _scaleA, const float _scaleB) : scaleA(_scaleA), scaleB(_scaleB) { + } + WeightedAdd() : scaleA(0), scaleB(0) { // Compiler complains about no default constructor? + } + __device__ inline float operator()(const float a, const float b) const { + return a * scaleA + b * scaleB; + } + }; + + class WeightedAdd1 : public BinaryOp { + private: + const float scaleB; + public: + WeightedAdd1(const float _scaleB) : scaleB(_scaleB) { + } + __device__ inline float operator()(const float a, const float b) const { + return a + b * scaleB; + } + }; + + class ScaledAdd : public BinaryOp { + private: + const float scaleB; + public: + ScaledAdd(const float _scaleB) : scaleB(_scaleB) { + } + __device__ inline float operator()(const float a, const float b) const { + return a + b * scaleB; + } + }; + + class Add : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + }; + + class First : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return a; + } + }; + + class Second : public BinaryOp { + public: + __device__ inline float operator()(const float a, const float b) const { + return b; + } + }; + + class SecondScaled : public BinaryOp { + private: + const float scale; + public: + SecondScaled(const float _scale) : scale(_scale) { + } + + SecondScaled() : scale(0) { // Compiler complains about no default constructor? 
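The operator classes above are plain functors with a __device__ operator(); the library passes them by value into templated element-wise kernels so that the per-element expression is inlined at compile time. A minimal sketch of that pattern, using a hypothetical applyUnary kernel rather than the library's own element-wise kernels:

```
#include <cstdio>
#include <cuda_runtime.h>

// Functor in the same style as NVMatrixOps::MultByScalar.
struct MultByScalar {
    float scalar;
    explicit MultByScalar(float s) : scalar(s) {}
    __device__ float operator()(float a) const { return a * scalar; }
};

// Hypothetical element-wise kernel: the functor is a template parameter,
// so operator() is inlined instead of going through a function pointer.
template <class Op>
__global__ void applyUnary(const float* in, float* out, int n, Op op) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = op(in[i]);
}

int main() {
    const int n = 8;
    float h[n] = {0, 1, 2, 3, 4, 5, 6, 7};
    float *dIn, *dOut;
    cudaMalloc(&dIn, n * sizeof(float));
    cudaMalloc(&dOut, n * sizeof(float));
    cudaMemcpy(dIn, h, n * sizeof(float), cudaMemcpyHostToDevice);
    applyUnary<<<1, 32>>>(dIn, dOut, n, MultByScalar(2.0f));
    cudaMemcpy(h, dOut, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) printf("%.1f ", h[i]);   // 0.0 2.0 4.0 ... 14.0
    printf("\n");
    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}
```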
+ } + __device__ inline float operator()(const float a, const float b) const { + return scale * b; + } + }; + + template + class CompositeSecond : public BinaryOp { + private: + UnaryOp _uop; + BinaryOp _bop; + public: + CompositeSecond(UnaryOp uop, BinaryOp bop) : _uop(uop), _bop(bop) { + + } + __device__ inline float operator()(const float a, const float b) const { + return _bop(a, _uop(b)); + } + }; +}; + +class NVMatrixAggs { +public: + class Sum { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + b; + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + class Max { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b ? a : b; + } + __device__ inline float getBaseValue() { + return -2e38; + } + }; + + class Min { + public: + __device__ inline float operator()(const float a, const float b) const { + return a > b ? b : a; + } + __device__ inline float getBaseValue() { + return 2e38; + } + }; + + class CountNan { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + isnan(b); + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + class CountInf { + public: + __device__ inline float operator()(const float a, const float b) const { + return a + isinf(b); + } + __device__ inline float getBaseValue() { + return 0; + } + }; + + template + class ArgMax { + private: + UnaryOperator u; + public: + ArgMax(UnaryOperator _u) : u(_u) { + } + __device__ inline float operator()(const float a, const float b) const { + return u(a) > u(b) ? a : b; + } + __device__ inline float getBaseValue() { + return u.getArgMin(); + } + }; +}; + +class NVMatrixTernaryOps { +public: + class Add { + public: + __device__ inline float operator()(const float a, const float b, const float c) const { + return a + b + c; + } + }; + class WeightedAdd { + private: + const float scaleA, scaleB, scaleC; + public: + WeightedAdd(const float _scaleA, const float _scaleB, const float _scaleC) : scaleA(_scaleA), scaleB(_scaleB), scaleC(_scaleC) { + } + __device__ inline float operator()(const float a, const float b, const float c) const { + return a * scaleA + b * scaleB + c * scaleC; + } + }; +}; + +#endif /* NVMATRIX_OPERATORS_CUH */ + diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu new file mode 100644 index 0000000..aab7e45 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu @@ -0,0 +1,97 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
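Each aggregate functor pairs a binary combine step with getBaseValue() as its identity element, which is all a fold needs: the reduction kernels start every accumulator at getBaseValue() and then repeatedly apply agg(acc, x). A small host-side illustration of that contract (names are illustrative):

```
#include <cstdio>

// Same shape as NVMatrixAggs::Max: a combine step plus an identity value.
struct MaxAgg {
    float operator()(float a, float b) const { return a > b ? a : b; }
    float getBaseValue() const { return -2e38f; }
};

// Generic fold: works for any functor exposing operator() and getBaseValue().
template <class Agg>
float reduce(const float* data, int n, Agg agg) {
    float acc = agg.getBaseValue();
    for (int i = 0; i < n; ++i) acc = agg(acc, data[i]);
    return acc;
}

int main() {
    float v[5] = {3.0f, -1.0f, 7.0f, 2.0f, 5.0f};
    printf("max = %.1f\n", reduce(v, 5, MaxAgg()));   // max = 7.0
    return 0;
}
```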
+ */ + +#include "../include/memory.cuh" + +Lock MemoryManager::_globalLock; +std::map FastMemoryManager::_memoryManagers; + +MemoryManager& FastMemoryManager::getInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManagers.count(deviceID) == 0) { + _memoryManagers[deviceID] = (new FastMemoryManager(deviceID))->init(); + } + MemoryManager& ret = *_memoryManagers[deviceID]; + _globalLock.release(); + return ret; +} + +MemoryManager* CUDAMemoryManager::_memoryManager = NULL; +MemoryManager& CUDAMemoryManager::getInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = new CUDAMemoryManager(); + } + _globalLock.release(); + return *_memoryManager; +} + +MemoryManager* CUDAHostMemoryManager::_memoryManager = NULL; +MemoryManager& CUDAHostMemoryManager::getInstance() { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = new CUDAHostMemoryManager(); + } + _globalLock.release(); + return *_memoryManager; +} + +MemoryManager* FastHostMemoryManager::_memoryManager = NULL; +MemoryManager& FastHostMemoryManager::getInstance() { + _globalLock.acquire(); + if (_memoryManager == NULL) { + _memoryManager = (new FastHostMemoryManager())->init(); + } + _globalLock.release(); + return *_memoryManager; +} + + +void FastMemoryManager::destroyInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManagers.count(deviceID) != 0) { + delete _memoryManagers[deviceID]; + _memoryManagers.erase(deviceID); + } + _globalLock.release(); +} + +void FastHostMemoryManager::destroyInstance() { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} + +void CUDAMemoryManager::destroyInstance(int deviceID) { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} + +void CUDAHostMemoryManager::destroyInstance() { + _globalLock.acquire(); + if (_memoryManager != NULL) { + delete _memoryManager; + _memoryManager = NULL; + } + _globalLock.release(); +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu new file mode 100644 index 0000000..e37c1de --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu @@ -0,0 +1,1724 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../include/nvmatrix.cuh" +#include "../include/nvmatrix_operators.cuh" + +using namespace std; + +/* + * Device random number generator pointers. 
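memory.cu lazily builds one manager per device, and every getInstance/destroyInstance call holds a single global lock around the shared map so that two threads cannot both construct an instance. A minimal sketch of that create-if-absent-under-lock pattern, using std::mutex in place of the Lock class used here (type names are illustrative):

```
#include <map>
#include <mutex>

struct Manager {
    explicit Manager(int device) : device(device) {}
    int device;
};

class ManagerRegistry {
public:
    // Create-if-absent under a global lock, mirroring FastMemoryManager::getInstance.
    static Manager& getInstance(int deviceID) {
        std::lock_guard<std::mutex> guard(_lock);
        auto it = _managers.find(deviceID);
        if (it == _managers.end())
            it = _managers.emplace(deviceID, new Manager(deviceID)).first;
        return *it->second;
    }
    static void destroyInstance(int deviceID) {
        std::lock_guard<std::mutex> guard(_lock);
        auto it = _managers.find(deviceID);
        if (it != _managers.end()) {
            delete it->second;
            _managers.erase(it);
        }
    }
private:
    static std::mutex _lock;
    static std::map<int, Manager*> _managers;
};

std::mutex ManagerRegistry::_lock;
std::map<int, Manager*> ManagerRegistry::_managers;

int main() {
    Manager& m = ManagerRegistry::getInstance(0);
    (void)m;
    ManagerRegistry::destroyInstance(0);
    return 0;
}
```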
+ */ +//map NVMatrix::rndGen; +map NVMatrix::_rndDevStates; +map NVMatrix::_rndDevThreads; +pthread_mutex_t* NVMatrix::_rndMutex = makeMutex(); +pthread_mutex_t* NVMatrix::_cublasMutex = makeMutex(); +pthread_mutex_t* NVMatrix::_streamMutex = makeMutex(); +std::map NVMatrix::_cublasHandles; +std::map NVMatrix::_defaultStreams; + +pthread_mutex_t* NVMatrix::makeMutex() { + pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t)); + pthread_mutex_init(m, NULL); + return m; +} +/* + Do not call resize in _init because resize is a virtual function + which is overridden in base class. Since C++ is retarded and unable + to call overridden functions from constructors, we shall call resize + separately from every constructor after calling _init. +*/ +void NVMatrix::_init(bool isTrans) { + _numRows = 0; + _numCols = 0; + _numElements = 0; + _ownsData = true; + + _isTrans = isTrans; + _memSegment = NULL; + + _stride = 0; + _texObj = 0; +} + +NVMatrix::NVMatrix() : _deleted(false) { + _init(false); +} + +NVMatrix::NVMatrix(bool isTrans) : _deleted(false) { + _init(isTrans); +} + +NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) : _deleted(false) { + _init(isTrans); + resize(numRows, numCols); +} + +NVMatrix::NVMatrix(const Matrix& like, bool copy) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); + if (copy) { + copyFromHost(like); + } +} + +NVMatrix::NVMatrix(const NVMatrix& like, bool copy) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); + if (copy) { + like.copy(*this); + } +} + +/* + * Initializes NVMatrix with same dimensions as given matrix but + * does not copy any data. + */ +NVMatrix::NVMatrix(const NVMatrix& like) : _deleted(false) { + _init(like.isTrans()); + resize(like.getNumRows(), like.getNumCols()); +} + +/* + * Initializes NVMatrix with same dimensions as given matrix but + * does not copy any data. + */ +NVMatrix::NVMatrix(const Matrix& like) : _deleted(false) { + _init(false); + resize(like.getNumRows(), like.getNumCols()); +} + +NVMatrix::NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) : + _numRows(numRows), + _numCols(numCols), + _numElements(numRows*numCols), + _ownsData(false), + _memSegment(mem), + _isTrans(isTrans), + _deleted(false), + _texObj(0) { + _stride = stride < 0 ? getLeadingDim() : stride; +} + +NVMatrix::~NVMatrix() { + if (!_deleted) { + deallocTexture(); + if(_ownsData && _numElements > 0) { + dealloc(); + } else { + // dealloc deletes the mem segment. But if this is a view, + // then we still need to delete the mem segment object. 
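The comment above _init describes a real C++ rule: while a base-class constructor runs, the object's dynamic type is still the base class, so a virtual call resolves to the base implementation. That is why every constructor calls resize explicitly after _init instead of relying on _init to do it. A small stand-alone illustration (hypothetical class names):

```
#include <cstdio>

struct Base {
    Base() { whoAmI(); }                        // virtual call during construction...
    virtual void whoAmI() { printf("Base\n"); } // ...binds to Base::whoAmI
    virtual ~Base() {}
};

struct Derived : Base {
    void whoAmI() override { printf("Derived\n"); }
};

int main() {
    Derived d;      // prints "Base": the override is not active yet
    d.whoAmI();     // prints "Derived": normal virtual dispatch after construction
    return 0;
}
```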
+// assert(_memSegment == NULL || _memSegment->getSize() == 0); + delete _memSegment; + } + } +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix) { + copyFromHost(hostMatrix, false, getDefaultStream()); +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { + copyFromHost(hostMatrix, resizeTarget, getDefaultStream()); +} + +void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) { + if (resizeTarget) { + resize(hostMatrix); + } else { + assert(isSameDims(hostMatrix)); + } + setTrans(hostMatrix.isTrans()); + + if (getNumElements() > 0) { + CUBLAS_CALL(cublasSetMatrixAsync(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float), + hostMatrix.getData(), hostMatrix.getLeadingDim(), getDevData(), _stride, stream)); + syncStream(stream); + } +} + +void NVMatrix::copyToHost(Matrix& hostMatrix) const { + copyToHost(hostMatrix, false, getDefaultStream()); +} + +void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { + copyToHost(hostMatrix, resizeTarget, getDefaultStream()); +} + +void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { + if (resizeTarget) { + hostMatrix.resize(_numRows, _numCols); + } else { + assert(isSameDims(hostMatrix)); + } + hostMatrix.setTrans(_isTrans); + + if (getNumElements() > 0) { + CUBLAS_CALL(cublasGetMatrixAsync(getLeadingDim(),getFollowingDim(), sizeof(float), + getDevData(), getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim(), stream)); + syncStream(stream); + } +} + +void NVMatrix::copy(NVMatrix& dest) const { + copy(dest, getDefaultStream()); +} + +void NVMatrix::copy(NVMatrix& dest, cudaStream_t stream) const { + if (&dest != this) { + if (!isSameDims(dest)) { + dest.resize(*this); + } + copy(dest, 0, -1, 0, -1, 0, 0, stream); + } +} + +NVMatrix& NVMatrix::copy() const { + NVMatrix& c = construct(); + copy(c); + return c; +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target) { + rightMult(b, scaleAB, target, getDefaultStream()); +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream) { +// if(&target != this && &target != &b) { +// target.resize(_numRows, b.getNumCols()); +// target.setTrans(true); +// } + target.addProduct(*this, b, 0, scaleAB, stream); +} + +void NVMatrix::rightMult(NVMatrix &b, float scaleAB) { + rightMult(b, scaleAB, *this); +} + +void NVMatrix::rightMult(NVMatrix &b, NVMatrix& target) { + rightMult(b, 1, target); +} + +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB) { + addProduct(a, b, scaleThis, scaleAB, getDefaultStream()); +} + +/* + * This will only work if this matrix is in column-major order! In other words, + * if isTrans() returns true. 
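addProduct hands the multiply to cublasSgemm, and cuBLAS assumes column-major storage, which is why the comment requires isTrans() to be true for the target. A stand-alone sketch of an equivalent call on small column-major matrices (a minimal example, not this class's code):

```
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    // Column-major A (2x3) and B (3x2): C = A * B is 2x2.
    const int m = 2, k = 3, n = 2;
    float hA[m * k] = {1, 4,  2, 5,  3, 6};          // columns of A
    float hB[k * n] = {7, 9, 11,  8, 10, 12};        // columns of B
    float hC[m * n] = {0};

    float *dA, *dB, *dC;
    cudaMalloc(&dA, sizeof(hA));
    cudaMalloc(&dB, sizeof(hB));
    cudaMalloc(&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;           // C = 1*A*B + 0*C
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, dA, m, dB, k, &beta, dC, m); // leading dims = row counts

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("%.0f %.0f\n%.0f %.0f\n", hC[0], hC[2], hC[1], hC[3]);  // 58 64 / 139 154
    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}
```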
+ */ +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream) { + assert(a.getNumCols() == b.getNumRows()); + + if (scaleThis == 0) { + resize(a.getNumRows(), b.getNumCols()); + setTrans(true); + } + + assert(this->getNumRows() == a.getNumRows()); + assert(this->getNumCols() == b.getNumCols()); + assert(_isTrans); + CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); + CUBLAS_CALL(cublasSgemm_v2(getCublasHandle(), a.getTransChar(), b.getTransChar(), a.getNumRows(), b.getNumCols(), a.getNumCols(), + &scaleAB, a.getDevData(), a.getStride(), b.getDevData(), b.getStride(), + &scaleThis, getDevData(), getStride())); +} + +void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b) { + addProduct(a, b, 1, 1); +} + +void NVMatrix::assertSame(NVMatrixV& a) { + for (int i = 1; i < a.size(); ++i) { + assert(a[i]->isSameDims(*a[0])); + assert(a[i]->isTrans() == a[0]->isTrans()); + assert(a[i]->getStride() == a[0]->getStride()); + assert(a[i]->getDataDeviceID() == a[0]->getDataDeviceID()); + } +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, + const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream(), aPtrsDev, bPtrsDev, tgtPtrsDev); +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB) { + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream()); +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, + const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { + assert(a.size() == b.size()); + assert(a.size() == target.size()); + assertSame(a); + assertSame(b); + assertSame(target); + + const int batch = a.size(); + if (batch > 0) { + const int rows = a[0]->getNumRows(), inner = a[0]->getNumCols(), cols = b[0]->getNumCols(); + + assert(inner == b[0]->getNumRows()); + assert(target[0]->getNumRows() == rows); + assert(target[0]->getNumCols() == cols); + + const int lda = a[0]->getStride(), ldb = b[0]->getStride(), ldc = target[0]->getStride(); + cublasOperation_t atrans = a[0]->getTransChar(), btrans = b[0]->getTransChar(); + + CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); + CUBLAS_CALL(cublasSgemmBatched(getCublasHandle(), atrans, btrans, rows, cols, inner, &scaleAB, aPtrsDev, lda, bPtrsDev, ldb, &scaleTarget, tgtPtrsDev, ldc, batch)); + } +} + +void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream) { + assert(a.size() == b.size()); + assert(a.size() == target.size() || target.size() == 0); + + const int batch = a.size(); + if (batch > 0) { + const int rows = a[0]->getNumRows(), cols = b[0]->getNumCols(); + + const float* aPtrs[batch], *bPtrs[batch], *tgtPtrs[batch]; + for (int i = 0; i < batch; ++i) { + if (target.size() <= i) { + target.push_back(new NVMatrix(rows, cols, true)); + } + aPtrs[i] = a[i]->getDevData(); + bPtrs[i] = b[i]->getDevData(); + tgtPtrs[i] = target[i]->getDevData(); + } + +// const float** aPtrsDev, **bPtrsDev; +// float **tgtPtrsDev; +// checkCudaErrors(cudaMalloc(&aPtrsDev, batch * sizeof(float*))); +// checkCudaErrors(cudaMalloc(&bPtrsDev, batch * sizeof(float*))); +// checkCudaErrors(cudaMalloc(&tgtPtrsDev, batch * sizeof(float*))); + MemorySegment* aPtrsDev = 
DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + MemorySegment* bPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + MemorySegment* tgtPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); + + checkCudaErrors(cudaMemcpyAsync(aPtrsDev, aPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(bPtrsDev, bPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(tgtPtrsDev, tgtPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream)); + + batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, stream, const_cast(aPtrsDev->getData()), + const_cast(bPtrsDev->getData()), + tgtPtrsDev->getData()); + +// checkCudaErrors(cudaFree(aPtrsDev)); +// checkCudaErrors(cudaFree(bPtrsDev)); +// checkCudaErrors(cudaFree(tgtPtrsDev)); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(aPtrsDev); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(bPtrsDev); + DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(tgtPtrsDev); + } +} + +template +void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) { + _unaryRandomize(target, rnd, getDefaultStream()); +} + +template +void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream) { + assert(isRndInitialized()); + assert(isContiguous() && target.isContiguous()); + if (!isSameDims(target)) { + target.resize(*this); + } + assert(isTrans() == target.isTrans()); + kUnaryRandomize<<>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd); + getLastCudaError("kUnaryRandomize: Kernel execution failed"); +} + +template +void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) { + _binaryRandomize(data2, target, rnd, getDefaultStream()); +} + +template +void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream) { + assert(isRndInitialized()); + assert(isContiguous() && data2.isContiguous() && target.isContiguous()); + assert(isSameDims(data2)); + assert(isTrans() == data2.isTrans()); + if (!isSameDims(target)) { + target.resize(*this); + } + assert(isTrans() == target.isTrans()); + kBinaryRandomize<<>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd); + getLastCudaError("kBinaryRandomize: Kernel execution failed"); +} + +void NVMatrix::initRandom(unsigned long long seed, int numStreams) { + NVMatrix::initRandom(seed, numStreams, NVMatrix::getDefaultStream()); +} + +void NVMatrix::initRandom(unsigned long long seed, int numStreams, cudaStream_t stream) { +// printf("init random on device %d\n", getDeviceID()); + pthread_mutex_lock(_rndMutex); + assert(!isRndInitialized(true)); + int d = getDeviceID(); +// _rndDevStates[d] = NULL; + _rndDevThreads[d] = numStreams; + _rndDevStates[d] = DEVICE_MEMORY_MANAGER::getInstance(d).malloc(numStreams * sizeof(curandState)); +// checkCudaErrors(cudaMalloc((void **)&_rndDevStates[d], numStreams * sizeof(curandState))); + pthread_mutex_unlock(_rndMutex); + kSetupCurand<<>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one + getLastCudaError("kSetupCurand: Kernel execution failed"); +} + +void NVMatrix::initRandom(unsigned long long seed) { + initRandom(seed, NUM_RND_STREAMS); +} + +void NVMatrix::initRandom() { + NVMatrix::initRandom(time(0)); +} + +void NVMatrix::initCublas() { + int d = getDeviceID(); + 
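initRandom allocates a pool of curandState objects once, and the randomize kernels copy a thread's state into registers, draw numbers, and write the state back so that sequences continue across launches. A minimal sketch of that lifecycle with the cuRAND device API (kernel and buffer names are illustrative):

```
#include <cstdio>
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void setupStates(curandState* states, unsigned long long seed) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tid, 0, &states[tid]);   // one independent sequence per thread
}

__global__ void fillUniform(float* out, int n, curandState* states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState local = states[tid];           // work on a register-local copy
    for (int i = tid; i < n; i += gridDim.x * blockDim.x)
        out[i] = curand_uniform(&local);
    states[tid] = local;                       // persist so the next launch continues the sequence
}

int main() {
    const int threads = 128, blocks = 2, n = 1000;
    curandState* states;
    float* data;
    cudaMalloc(&states, blocks * threads * sizeof(curandState));
    cudaMalloc(&data, n * sizeof(float));
    setupStates<<<blocks, threads>>>(states, 1234ULL);
    fillUniform<<<blocks, threads>>>(data, n, states);
    float first;
    cudaMemcpy(&first, data, sizeof(float), cudaMemcpyDeviceToHost);
    printf("first sample: %f\n", first);
    cudaFree(states);
    cudaFree(data);
    return 0;
}
```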
pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(d) == 0); + CUBLAS_CALL(cublasCreate(&_cublasHandles[d])); + // It appears that cublasCreate causes a host -> device copy on stream 0, + // so we synchronize with it because we run everything else on other + // streams. + syncDevice(); + pthread_mutex_unlock(_cublasMutex); +} + +void NVMatrix::destroyCublas() { + int d = getDeviceID(); + pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(d) > 0); + CUBLAS_CALL(cublasDestroy(_cublasHandles[d])); + _cublasHandles.erase(d); + pthread_mutex_unlock(_cublasMutex); +} + +cublasHandle_t NVMatrix::getCublasHandle() { + return getCublasHandle(getDeviceID()); +} + +cublasHandle_t NVMatrix::getCublasHandle(int deviceID) { + pthread_mutex_lock(_cublasMutex); + assert(_cublasHandles.count(deviceID) > 0); + cublasHandle_t h = _cublasHandles[deviceID]; + pthread_mutex_unlock(_cublasMutex); + return h; +} + +cudaStream_t NVMatrix::getDefaultStream() { + return getDefaultStream(NVMatrix::getDeviceID()); +} + +cudaStream_t NVMatrix::getDefaultStream(int deviceID) { + if (deviceID >= 0) { + pthread_mutex_lock(_streamMutex); + if (_defaultStreams.count(deviceID) == 0) { + int oldDeviceID = getDeviceID(); + NVMatrix::setDeviceID(deviceID); + checkCudaErrors(cudaStreamCreateWithFlags(&_defaultStreams[deviceID], cudaStreamNonBlocking)); + NVMatrix::setDeviceID(oldDeviceID); + } + cudaStream_t s = _defaultStreams[deviceID]; + pthread_mutex_unlock(_streamMutex); + return s; + } + return 0; +} + +void NVMatrix::syncDevice() { + checkCudaErrors(cudaDeviceSynchronize()); +} + +void NVMatrix::syncStream(cudaStream_t stream) { + checkCudaErrors(cudaStreamSynchronize(stream)); +} + +void NVMatrix::syncStream() { + syncStream(getDefaultStream()); +} + +curandState* NVMatrix::getCurandState() { + /* + * Even though we're only reading from the map here, it's important to grab + * the mutex because another thread may be writing to it. + */ + pthread_mutex_lock(_rndMutex); + int d = getDeviceID(); + assert(isRndInitialized(true)); + curandState* r = _rndDevStates[d]->getData(); + pthread_mutex_unlock(_rndMutex); + return r; +} + +curandState* NVMatrix::getCurandState(int numStreams) { + int d = getDeviceID(); + pthread_mutex_lock(_rndMutex); + assert(isRndInitialized(true)); + bool realloc = numStreams > _rndDevThreads[d]; + pthread_mutex_unlock(_rndMutex); + + if (realloc) { + destroyRandom(); + initRandom(time(0), numStreams); + } + return getCurandState(); +} + +int NVMatrix::getDataDeviceID() const { + if (getDevData() == NULL) { + return DEVICE_NULL; + } + struct cudaPointerAttributes atts; + checkCudaErrors(cudaPointerGetAttributes(&atts, getDevData())); + return atts.memoryType == cudaMemoryTypeDevice ? 
atts.device : DEVICE_HOST; +} + + +int NVMatrix::getDeviceID() { + int d; + checkCudaErrors(cudaGetDevice(&d)); +// if (d == 0) { +// raise(SIGABRT); +// } + return d; +} + +void NVMatrix::setDeviceID(int d) { + assert(d >= 0); +// printf("Setting device to %d\n", d); +// if (d == 0) { +// raise(SIGABRT); +// } + checkCudaErrors(cudaSetDevice(d)); +} + +bool NVMatrix::canAccessPeer(int srcDevice, int tgtDevice) { + if (srcDevice == tgtDevice) { + return true; + } + int canAccess; + checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice)); + return canAccess; +} + +bool NVMatrix::isRndInitialized(bool haveLock) { + if (!haveLock) { + pthread_mutex_lock(_rndMutex); + } + bool b = _rndDevStates.count(getDeviceID()) != 0; + if (!haveLock) { + pthread_mutex_unlock(_rndMutex); + } + return b; +} + +bool NVMatrix::isRndInitialized() { + return isRndInitialized(false); +} + +void NVMatrix::destroyRandom() { + int d = getDeviceID(); + pthread_mutex_lock(_rndMutex); + assert(isRndInitialized(true)); +// checkCudaErrors(cudaFree(_rndDevStates[d])); + DEVICE_MEMORY_MANAGER::getInstance(d).free(_rndDevStates[d]); + _rndDevStates.erase(d); + _rndDevThreads.erase(d); + pthread_mutex_unlock(_rndMutex); +} + +void NVMatrix::binarizeProbs() { + binarizeProbs(*this); +} + +void NVMatrix::binarizeProbs(NVMatrix& target) { + _unaryRandomize(target, BinarizeUnaryRandomizer()); +} + +void NVMatrix::randomizeUniform() { + assert(isContiguous()); + assert(isRndInitialized()); +// CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements())); + _unaryRandomize(*this, UniformUnaryRandomizer()); +} + +void NVMatrix::randomizeGaussian() { + randomizeGaussian(1); +} + +void NVMatrix::randomizeGaussian(float stdev) { + randomizeGaussian(0, stdev); +} + +void NVMatrix::randomizeGaussian(float mean, float stdev) { + assert(isContiguous()); + assert(isRndInitialized()); +// CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev)); + _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev)); +} + +/* + * Kind of a hack since we don't actually need the contents of this matrix for it, + * so we don't really need a binary randomizer. 
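canAccessPeer only reports whether two GPUs can address each other's memory; actually issuing peer copies also requires enabling access from the source device, a step this class leaves to its callers. A short sketch of that extra step with the CUDA runtime API (the helper name is hypothetical):

```
#include <cstdio>
#include <cuda_runtime.h>

// Query peer capability and, if available, enable srcDevice -> tgtDevice access.
bool enablePeer(int srcDevice, int tgtDevice) {
    if (srcDevice == tgtDevice) return true;
    int canAccess = 0;
    cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice);
    if (!canAccess) return false;
    cudaSetDevice(srcDevice);
    cudaError_t err = cudaDeviceEnablePeerAccess(tgtDevice, 0);   // flags must be 0
    return err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled;
}

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i)
        for (int j = 0; j < count; ++j)
            printf("peer %d -> %d: %s\n", i, j, enablePeer(i, j) ? "yes" : "no");
    return 0;
}
```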
+ */ +void NVMatrix::randomizeGaussian(NVMatrix& stdevs) { + randomizeGaussian(0, stdevs); +} + +void NVMatrix::randomizeGaussian(float mean, NVMatrix& stdevs) { + _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer(mean)); +} + +void NVMatrix::randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs) { + _binaryRandomize(stdevs, *this, ScaledGaussianBinaryRandomizer(mean, stdevMult)); +} + +void NVMatrix::addGaussianNoise() { + addGaussianNoise(1); +} + +void NVMatrix::addGaussianNoise(float stdev) { + addGaussianNoise(stdev, *this); +} + +void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) { + _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev)); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) { + addGaussianNoise(stdevs, var, *this); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs) { + addGaussianNoise(stdevs, false, *this); +} + +void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) { + if (var) { + _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer()); + } else { + _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer()); + } +} + +void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target); +} + +void NVMatrix::biggerThan(NVMatrix& b) { + biggerThan(b, *this); +} + +void NVMatrix::equals(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Equals(), b, target); +} + +void NVMatrix::equals(NVMatrix& m) { + equals(m, *this); +} + +void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target); +} + +void NVMatrix::biggerThanVector(NVMatrix& vec) { + biggerThanVector(vec, *this); +} + +void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const { + assert(startRow >= 0 && startRow <= _numRows); + assert(endRow >= startRow && endRow <= _numRows); + + assert(startCol >= 0 && startCol <= _numCols); + assert(endCol >= startCol && endCol <= _numCols); +} + +/* + * The only place where stride is supported for now! + * Will ALWAYS return a view of the original data, sometimes non-contiguous. + */ +NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + + if (!isTrans()) { + return construct(new MemorySegment(this->getDevData() + startRow * _stride + startCol), endRow - startRow, endCol - startCol, _stride, false); + } + return construct(new MemorySegment(this->getDevData() + startCol * _stride + startRow), endRow - startRow, endCol - startCol, _stride, true); +} + +/* this will NEVER return a view */ +void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? 
this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + + int sliceRows = endRow - startRow, sliceCols = endCol - startCol; + if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) { + target.resize(sliceRows, sliceCols); + } + this->copy(target, startRow, endRow, startCol, endCol, 0, 0); +} + +NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const { + return slice(startRow, endRow, 0, -1); +} + +void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const { + slice(startRow, endRow, 0, -1, target); +} + +NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const { + return slice(0, -1, startCol, endCol); +} + +void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const { + slice(0, -1, startCol, endCol, target); +} + +NVMatrixV& NVMatrix::splitRows(int numParts) { + assert(getNumRows() % numParts == 0); + NVMatrixV& v = *new NVMatrixV(); + int partSize = getNumRows() / numParts; + for (int p = 0; p < numParts; ++p) { + v.push_back(&sliceRows(p * partSize, (p+1) * partSize)); + } + return v; +} + +NVMatrixV& NVMatrix::splitCols(int numParts) { + assert(getNumCols() % numParts == 0); + NVMatrixV& v = *new NVMatrixV(); + int partSize = getNumCols() / numParts; + for (int p = 0; p < numParts; ++p) { + v.push_back(&sliceCols(p * partSize, (p+1) * partSize)); + } + return v; +} + +/* + * Guaranteed to not change the data if the number of elements doesn't change. + * So you can use this to "reshape" a matrix. + */ +bool NVMatrix::resize(int numRows, int numCols, bool trans) { + setTrans(trans); + bool reallocated = false; + if (numRows != _numRows || numCols != _numCols) { + assert(_ownsData || (_numElements == numRows * numCols && isContiguous())); + if (_numElements != numRows * numCols) { + if (_numElements > 0) { // free old memory + dealloc(); + } + if (numRows * numCols > 0) { // allocate new memory + alloc(numCols * numRows); + } else { + _memSegment = NULL; + } + reallocated = true; + } + _numRows = numRows; + _numCols = numCols; + _numElements = numRows * numCols; + _stride = getLeadingDim(); + } + return reallocated; +} + +bool NVMatrix::resize(int numRows, int numCols) { + return resize(numRows, numCols, isTrans()); +} + +bool NVMatrix::resize(const NVMatrix& like) { + setTrans(like.isTrans()); + return resize(like.getNumRows(), like.getNumCols()); +} + +bool NVMatrix::resize(const Matrix& like) { + setTrans(like.isTrans()); + return resize(like.getNumRows(), like.getNumCols()); +} + +void NVMatrix::reshape(int numRows, int numCols) { + assert(isContiguous()); + assert(_numElements == numRows*numCols); + _numRows = numRows; + _numCols = numCols; + _stride = getLeadingDim(); +} + +NVMatrix& NVMatrix::reshaped(int numRows, int numCols) const { + assert(isContiguous()); + assert(_numElements == numRows*numCols); + return construct(new MemorySegment(*_memSegment), numRows, numCols, -1, _isTrans); +} + +void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, + int srcStartCol, int srcEndCol, + int destStartRow, int destStartCol) const { + copy(dest, srcStartRow, srcEndRow, srcStartCol, srcEndCol, destStartRow, destStartCol, getDefaultStream()); +} + +void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, + int srcStartCol, int srcEndCol, + int destStartRow, int destStartCol, cudaStream_t stream) const { + srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow; + srcEndCol = srcEndCol < 0 ? 
_numCols : srcEndCol; + NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol); + NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol); + if (srcSlice->isContiguous() && destSlice->isContiguous() && srcSlice->isSameDims(*destSlice) && srcSlice->isTrans() == destSlice->isTrans()) { + // The commonest case. + checkCudaErrors(cudaMemcpyAsync(destSlice->getDevData(), srcSlice->getDevData(), srcSlice->getNumDataBytes(), cudaMemcpyDefault, stream)); + } else { + srcSlice->apply(NVMatrixOps::Identity(), *destSlice, stream); + } + delete srcSlice; + delete destSlice; +} + + +NVMatrix& NVMatrix::getTranspose() { + return construct(new MemorySegment(*_memSegment), _numCols, _numRows, _stride, !_isTrans); +} + +NVMatrix& NVMatrix::getClone() { + return construct(new MemorySegment(*_memSegment), _numRows, _numCols, _stride, _isTrans); +} + +void NVMatrix::transpose(NVMatrix& target) { + flipTrans(target); + target.setTrans(!target.isTrans()); + target.reshape(target.getNumCols(), target.getNumRows()); +} + +void NVMatrix::transpose() { + int tmp = _numCols; + _numCols = _numRows; + _numRows = tmp; + _isTrans = !_isTrans; +} + +bool NVMatrix::transpose(bool trans) { + bool oldTrans = _isTrans; + if (oldTrans != trans) { + transpose(); + } + return oldTrans; +} + +/* + * Flips the ordering of the matrix from row-major to column-major and vice versa. + * This creates temporary storage -- not a cheap operation. + * + * This is not equivalent to a "hard transpose". The resultant matrix still has + * the same dimensions, its layout in memory just changes. + */ +NVMatrix& NVMatrix::flipTrans() { + NVMatrix& meTrans = construct(*this); + flipTrans(meTrans); + return meTrans; +} + +void NVMatrix::flipTrans(NVMatrix& target) { + flipTrans(target, getDefaultStream()); +} + +void NVMatrix::flipTrans(NVMatrix& target, cudaStream_t stream) { + assert(&target != this); + target.resize(_numRows, _numCols); + target.setTrans(!isTrans()); +// target.printShape("target"); +// this->printShape("this"); + apply(NVMatrixOps::Identity(), target, stream); +} + +void NVMatrix::squaredDiff(NVMatrix& b) { + squaredDiff(b, *this); +} + +void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) { + add(b, scaleA, scaleB, target, NVMatrix::getDefaultStream()); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream) { + if (scaleA == 0) { + b.scale(scaleB, target, stream); + } else if (scaleB == 0) { + scale(scaleA, target, stream); + } else if (scaleA == 1 && scaleB == 1) { // slight optimization + applyBinary(NVMatrixBinaryOps::Add(), b, target, stream); + } else if (scaleA == 1) { + applyBinary(NVMatrixBinaryOps::WeightedAdd1(scaleB), b, target, stream); + } else { + applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target, stream); + } +} + +void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) { + add(b, 1, scaleB, target); +} + +void NVMatrix::add(NVMatrix& b, NVMatrix& target) { + add(b, 1, target); +} + +void NVMatrix::add(NVMatrix& b, float scaleB) { + add(b, scaleB, *this); +} + +void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) { + add(b, scaleA, scaleB, *this); +} + +void NVMatrix::add(NVMatrix& b) { + add(b, 1, *this); +} + +void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) { + 
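transpose() and getTranspose() never move data: they swap the row/column counts and the _isTrans flag, so lookups simply switch between the row-major and column-major index formulas, whereas flipTrans() allocates a target and rewrites the layout. A small host-side sketch of that flag-based indexing (illustrative names):

```
#include <cassert>
#include <cstdio>

// Read element (row, col) from a buffer that is either row-major
// (isTrans == false) or column-major (isTrans == true), as NVMatrix does.
float at(const float* data, int stride, bool isTrans, int row, int col) {
    return isTrans ? data[col * stride + row] : data[row * stride + col];
}

int main() {
    // A 2x3 matrix stored row-major with stride 3: [[1 2 3], [4 5 6]].
    float data[6] = {1, 2, 3, 4, 5, 6};
    // The same buffer viewed as its 3x2 transpose: flip the flag, swap the
    // dims, keep the stride; no copy needed, which is what getTranspose() returns.
    assert(at(data, 3, false, 1, 2) == 6.0f);   // original (1,2)
    assert(at(data, 3, true, 2, 1) == 6.0f);    // transposed view (2,1)
    printf("ok\n");
    return 0;
}
```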
add(b, -1, target); +} + +void NVMatrix::subtract(NVMatrix& b) { + add(b, -1); +} + +void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Multiply(), b, target); +} + +void NVMatrix::eltwiseMult(NVMatrix& b) { + eltwiseMult(b, *this); +} + +void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) { + applyBinary(NVMatrixBinaryOps::Divide(), b, target); +} + +void NVMatrix::eltwiseDivide(NVMatrix& b) { + eltwiseDivide(b, *this); +} + +void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) { + tile(timesY, timesX, target, getDefaultStream()); +} + +void NVMatrix::tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream) { + assert(isContiguous() && target.isContiguous()); + assert(timesX > 0 && timesY > 0); + target.resize(_numRows*timesY, _numCols*timesX); + target.setTrans(_isTrans); + if(!isTrans()) { + kTile<<>>(getDevData(), target.getDevData(), _numCols, _numRows, target._numCols, target._numRows); + } else { + kTile<<>>(getDevData(), target.getDevData(), _numRows, _numCols, target._numRows, target._numCols); + } + getLastCudaError("Kernel execution failed"); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) { + addVector(vec, scaleVec, target, getDefaultStream()); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream) { + applyBinaryV(NVMatrixBinaryOps::ScaledAdd(scaleVec), vec, target, stream); +} + +void NVMatrix::addVector(NVMatrix& vec) { + addVector(vec, 1); +} + +void NVMatrix::addVector(NVMatrix& vec, float scaleVec) { + addVector(vec, scaleVec, *this); +} + +void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) { + addVector(vec, 1, target); +} + +void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target); +} + +void NVMatrix::equalsVector(NVMatrix& vec) { + equalsVector(vec, *this); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) { + eltwiseMultByVector(vec, target, getDefaultStream()); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream) { + applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target, stream); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream) { + eltwiseMultByVector(vec, *this, stream); +} + +void NVMatrix::eltwiseMultByVector(NVMatrix& vec) { + eltwiseMultByVector(vec, *this); +} + +void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) { + eltwiseDivideByVector(vec, *this); +} + +void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) { + applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { + _aggregate(axis, target, agg, uop, bop, stream, NULL); +} + +/* + * TODO: this is a mess, fix it. it works pretty fast but it's too ugly. + * TODO: this function is _really_ bad for very long aggregations of few columns. + */ +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp) { + assert(axis == 0 || axis == 1); + assert(isContiguous() && target.isContiguous()); + assert(&target != this); + int width = _isTrans ? _numRows : _numCols; + int height = _isTrans ? _numCols : _numRows; + + target.setTrans(_isTrans); + assert(width > 0); + assert(height > 0); + if((axis == 0 && !_isTrans) || (axis == 1 && _isTrans)) { //col sum + target.resize(!_isTrans ? 
1 : _numRows, !_isTrans ? _numCols : 1); +// int height = getFollowingDim(); + if ((height <= 2048 || width >= 4096)) { + int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width); + assert(numBlocks < NUM_BLOCKS_MAX); + kDumbAggCols<<>>(getTextureObject(), target.getDevData(), width, height, agg, uop, bop); + getLastCudaError("kDumbAggCols: Kernel execution failed"); + } else { // Specialize the case when we have very long columns and few of them + const int sumLength = 128; + bool deltmp = tmp == NULL; + if (tmp == NULL) { + tmp = new NVMatrix(false); + } + + int numBlocksX = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + int numBlocksY = DIVUP(height, sumLength); + tmp->resize(numBlocksY, width); + + dim3 blocks(numBlocksX, numBlocksY); + dim3 threads(NUM_SUM_COLS_THREADS_PER_BLOCK); + kAggCols<<>>(getTextureObject(), tmp->getDevData(), width, height, sumLength, agg, uop); + getLastCudaError("kAggCols: Kernel execution failed"); + + int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); + kDumbAggCols<<>>(tmp->getTextureObject(), target.getDevData(), width, numBlocksY, agg, NVMatrixOps::Identity(), bop); + getLastCudaError("kDumbAggCols: Kernel execution failed"); + if (deltmp) { + delete tmp; + } + } + } else { // row sum + target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1); + if (width > 1) { + if (height >= 16384) { // linear aggregation + int numBlocksX = 1; + int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y); + int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X; + int numThreadsY = AGG_SHORT_ROWS_THREADS_Y; + while (numBlocksY > NUM_BLOCKS_MAX) { + numBlocksY = DIVUP(numBlocksY,2); + numBlocksX *= 2; + } + dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); + if(width <= 16) { + if(width <= 4) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 8) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 12) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } + } else if(width <= 32) { + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 48){ + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else if(width <= 64){ + kAggShortRows<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } else { + kAggShortRows2<<>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); + } + } else { + if (width >= 512) { + // NOTE: this is the only case which I bothered to try to optimize for Kepler + dim3 threads(AWR_NUM_THREADS); + dim3 blocks(1, height); + kAggRows_wholerow_nosync<<>>(getDevData(), target.getDevData(), width, height, agg, uop, bop); + } else { + + int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 
256 : 512))); + int numThreadsY = 1; + int numBlocksX = DIVUP(width, 2*numThreadsX); + int numBlocksY = std::min(height, NUM_BLOCKS_MAX); + + dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); + assert(numBlocksX <= NUM_BLOCKS_MAX); + assert(numBlocksY <= NUM_BLOCKS_MAX); + + if(width <= 64) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 128) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 256) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else if(width <= 512) { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } else { + kAggRows<<>>(getDevData(), target.getDevData(), + width, height, target.getLeadingDim(), agg, uop, bop); + } + + getLastCudaError("agg rows: Kernel execution failed"); + } + } + } else { + target.applyBinary(NVMatrixBinaryOps::CompositeSecond(uop, bop), *this, target, stream); +// copy(target, stream); + } + } +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop) { + _aggregate(axis, target, agg, uop, bop, getDefaultStream()); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream()); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, stream); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop) { + return _aggregate(axis, agg, NVMatrixOps::Identity(), bop); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream) { + return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream); +} + + + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { + _aggregate(axis, target, agg, uop, bop, getDefaultStream(), tmp); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream(), &tmp); +} + +template +void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { + _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream, &tmp); +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, tmp); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { + NVMatrix &sumVec = construct(); + _aggregate(axis, sumVec, agg, uop, bop, stream, tmp); + return sumVec; +} + +template +NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp) 
{
+    return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, tmp);
+}
+
+template <class Agg, class BinaryOp>
+NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) {
+    return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream, tmp);
+}
+
+void NVMatrix::inRangeInc(float lower, float upper) {
+    inRangeInc(lower, upper, *this);
+}
+void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) {
+    apply(NVMatrixOps::InRange<false>(lower, upper), target);
+}
+
+void NVMatrix::inRangeExc(float lower, float upper) {
+    inRangeExc(lower, upper, *this);
+}
+
+void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) {
+    apply(NVMatrixOps::InRange<true>(lower, upper), target);
+}
+
+void NVMatrix::biggerThanScalar(float scalar) {
+    biggerThanScalar(scalar, *this);
+}
+
+void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::BiggerThanScalar(scalar), target);
+}
+
+void NVMatrix::smallerThanScalar(float scalar) {
+    smallerThanScalar(scalar, *this);
+}
+
+void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::SmallerThanScalar(scalar), target);
+}
+
+void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target);
+}
+
+void NVMatrix::addScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::AddScalar(scalar), target);
+}
+
+void NVMatrix::addScalar(float scalar) {
+    addScalar(scalar, *this);
+}
+
+void NVMatrix::minWithScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::MinWithScalar(scalar), target);
+}
+
+void NVMatrix::minWithScalar(float scalar) {
+    minWithScalar(scalar, *this);
+}
+
+void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) {
+    apply(NVMatrixOps::MaxWithScalar(scalar), target);
+}
+
+void NVMatrix::maxWithScalar(float scalar) {
+    maxWithScalar(scalar, *this);
+}
+
+void NVMatrix::pow(float p, NVMatrix& target) {
+    apply(NVMatrixOps::Pow(p), target);
+}
+
+void NVMatrix::pow(float p) {
+    pow(p, *this);
+}
+
+void NVMatrix::scale(float _scale) {
+    scale(_scale, *this);
+}
+
+void NVMatrix::scale(float _scale, cudaStream_t stream) {
+    scale(_scale, *this, stream);
+}
+
+void NVMatrix::scale(float _scale, NVMatrix& target) {
+    scale(_scale, target, NVMatrix::getDefaultStream());
+}
+
+void NVMatrix::scale(float _scale, NVMatrix& target, cudaStream_t stream) {
+    if (_scale != 1 || &target != this) { // optimize away scale by 1
+        if (_scale == 1) {
+            copy(target, stream);
+        } else {
+            apply(NVMatrixOps::MultByScalar(_scale), target, stream);
+        }
+    }
+}
+
+void NVMatrix::zero() {
+    apply(NVMatrixOps::Zero());
+}
+
+void NVMatrix::zero(NVMatrix& like) {
+    resize(like);
+    zero();
+}
+
+void NVMatrix::max(int axis, NVMatrix& target) {
+    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
+}
+
+void NVMatrix::max(int axis, NVMatrix& target, NVMatrix& tmp) {
+    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second(), tmp);
+}
+
+void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) {
+    addSum(a, axis, scaleThis, scaleSum, getDefaultStream());
+}
+
+void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream) {
+    if (scaleThis != 0) {
+        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum), stream);
+    } else {
+        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum), stream);
+    }
+}
+
+void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax) {
+    addMax(a, axis, scaleThis, scaleMax, getDefaultStream());
+}
+
+void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream) {
+    if (scaleThis != 0) {
+        a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleMax), stream);
+    } else {
+        a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::SecondScaled(scaleMax), stream);
+    }
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target) {
+    sum(axis, target, getDefaultStream());
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream);
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, NVMatrix& tmp) {
+    sum(axis, target, getDefaultStream(), tmp);
+}
+
+void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream, tmp);
+}
+
+void NVMatrix::sumOfSquares(int axis, NVMatrix& target) {
+    sumOfSquares(axis, target, getDefaultStream());
+}
+
+void NVMatrix::sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream) {
+    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second(), stream);
+}
+
+void NVMatrix::min(int axis, NVMatrix& target) {
+    _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::max(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::sum(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::min(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
+}
+
+NVMatrix& NVMatrix::sumOfSquares(int axis) {
+    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second());
+}
+
+void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads) {
+    *threads = dim3(DP_BLOCKSIZE);
+    *blocks = dim3(std::min(CPUSUM_MAX, DIVUP(n, DP_BLOCKSIZE)));
+}
+
+float NVMatrix::mean() {
+    return sum() / getNumElements();
+}
+
+float NVMatrix::sum() {
+    return _totalAgg(NVMatrixAggs::Sum());
+}
+
+float NVMatrix::sum(NVMatrix& tmpbuf) {
+    return _totalAgg(NVMatrixAggs::Sum(), tmpbuf, getDefaultStream());
+}
+
+float NVMatrix::max() {
+    return _totalAgg(NVMatrixAggs::Max());
+}
+
+float NVMatrix::min() {
+    return _totalAgg(NVMatrixAggs::Min());
+}
+
+float NVMatrix::countNan() {
+    return _totalAgg(NVMatrixAggs::CountNan());
+}
+
+float NVMatrix::countInf() {
+    return _totalAgg(NVMatrixAggs::CountInf());
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg) {
+    return _totalAgg(agg, getDefaultStream());
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg, cudaStream_t stream) {
+    NVMatrix tmp;
+    return _totalAgg(agg, tmp, stream);
+}
+
+template <class Agg>
+float NVMatrix::_totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream) {
+    assert(isContiguous());
+    dim3 blocks, threads;
+    // Sum most of it on GPU
+
+    _sum_setParams(getNumElements(), &blocks, &threads);
+    tmpbuf.resize(1, blocks.x);
+    kTotalAgg<<<blocks, threads, 0, stream>>>(getDevData(), tmpbuf.getDevData(), getNumElements(), agg);
+    getLastCudaError("kTotalAgg: Kernel execution failed");
+    // Don't need to sync because we copyToHost in the same stream, so it's serialized
+//    NVMatrix::syncStream(stream);
+    return tmpbuf.cpuAgg(agg, stream);
+}
+template <class Agg>
+float NVMatrix::cpuAgg(Agg agg, cudaStream_t stream) {
+    Matrix bufCPU(getNumRows(), getNumCols());
+    copyToHost(bufCPU, false, stream);
+    if (getNumElements() > 1) { // Sum remainder on CPU
+        if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) {
+            return bufCPU.sum();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) {
+            return bufCPU.max();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) {
+            return bufCPU.min();
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::CountNan)) {
+            return bufCPU.hasNan(); //yea, it's not the same, who cares
+        } else if (typeid(Agg) == typeid(NVMatrixAggs::CountInf)) {
+            return bufCPU.hasInf();
+        } else {
+            assert(false);
+        }
+    }
+    return bufCPU(0,0);
+}
+
+float NVMatrix::dotProduct(NVMatrix& b) {
+    return dotProduct(b, getDefaultStream());
+}
+
+float NVMatrix::dotProduct(NVMatrix& b, cudaStream_t stream) {
+    NVMatrix tmp;
+    return dotProduct(b, tmp, stream);
+}
+
+/*
+ * Fast dot product only for matrices with same transposedness.
+ */
+float NVMatrix::dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream) {
+    assert(isContiguous() && b.isContiguous());
+    assert(isSameDims(b));
+    assert(isTrans() == b.isTrans()); // see?
+    dim3 blocks, threads;
+    _sum_setParams(getNumElements(), &blocks, &threads);
+//    NVMatrix target(1, blocks.x);
+    tmp.resize(1, blocks.x);
+    kDotProduct_r<<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), tmp.getDevData(), getNumElements());
+    getLastCudaError("kDotProduct_r: Kernel execution failed");
+//    cudaThreadSynchronize();
+//    syncStream(stream);
+//    return tmp._totalAgg(NVMatrixAggs::Sum(), stream);
+    return tmp.cpuAgg(NVMatrixAggs::Sum(), stream);
+}
+
+float NVMatrix::norm2() {
+    return dotProduct(*this);
+}
+
+float NVMatrix::norm() {
+    return sqrt(norm2());
+}
+
+void NVMatrix::print(int startRow, int rows, int startCol, int cols) const {
+//    cudaThreadSynchronize();
+    syncDevice();
+    Matrix hm = Matrix(_numRows, _numCols);
+    copyToHost(hm);
+    hm.print(startRow, rows, startCol, cols);
+}
+
+void NVMatrix::print(int rows, int cols) const {
+    print(0, rows, 0, cols);
+}
+
+void NVMatrix::printShape(const char* name) const {
+    printf("%s: %dx%d\n", name, _numRows, _numCols);
+}
+
+void NVMatrix::alloc(int numElements) {
+    _memSegment = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(numElements * sizeof(float));
+}
+
+void NVMatrix::dealloc() {
+    DEVICE_MEMORY_MANAGER::getInstance(_memSegment->getDeviceID()).free(_memSegment);
+    _memSegment = NULL;
+    deallocTexture();
+}
+
+void NVMatrix::deallocTexture() {
+    if (_texObj != 0) {
+        checkCudaErrors(cudaDestroyTextureObject(_texObj));
+        _texObj = 0;
+    }
+}
+
+cudaTextureObject_t NVMatrix::getTextureObject() {
+    if (_texObj == 0) {
+        assert(isContiguous());
+        //size_t memFree, memTotal;
+
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = cudaResourceTypeLinear;
+        resDesc.res.linear.devPtr = getDevData();
+        resDesc.res.linear.sizeInBytes = getNumDataBytes();
+        resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        checkCudaErrors(cudaCreateTextureObject(&_texObj, &resDesc, &texDesc, NULL));
+    }
+    assert(_texObj != 0);
+    return _texObj;
+}
+
+NVMatrix& NVMatrix::construct() const {
+    return *new NVMatrix();
+}
+NVMatrix& NVMatrix::construct(bool isTrans) const {
+    return *new NVMatrix(isTrans);
+}
+NVMatrix& NVMatrix::construct(int numRows, int numCols, bool isTrans) const {
+    return *new NVMatrix(numRows, numCols, isTrans);
+}
+NVMatrix& NVMatrix::construct(const Matrix& like, bool copy) const {
+    return *new NVMatrix(like, copy);
+}
+NVMatrix& NVMatrix::construct(const NVMatrix& like, bool copy) const {
+    return *new NVMatrix(like, copy);
+}
+NVMatrix& NVMatrix::construct(const NVMatrix& like) const {
+    return *new NVMatrix(like);
+}
+NVMatrix& NVMatrix::construct(const Matrix& like) const {
+    return *new NVMatrix(like);
+}
+NVMatrix& NVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
+    return *new NVMatrix(mem, numRows, numCols, stride, isTrans);
+}
+
+std::pair<size_t, size_t> NVMatrix::getCudaMemorySize() {
+    size_t memFree, memTotal;
+    checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal));
+    return std::pair<size_t, size_t>(memFree, memTotal);
+}
+
+
+/* ================
+ * HostNVMatrix
+ * ================
+ */
+HostNVMatrix::~HostNVMatrix() {
+    if (_ownsData && _numElements > 0) {
+        dealloc();
+    } else {
+        // dealloc frees the mem segment. But if this is a view,
+        // then we need to delete the mem segment object.
+//        assert(_memSegment == NULL || _memSegment->getSize() == 0);
+        delete _memSegment;
+    }
+    _deleted = true;
+}
+HostNVMatrix::HostNVMatrix() : NVMatrix() {
+    _init(false);
+}
+HostNVMatrix::HostNVMatrix(bool isTrans) {
+    _init(isTrans);
+}
+HostNVMatrix::HostNVMatrix(int numRows, int numCols, bool isTrans) {
+    _init(isTrans);
+    resize(numRows, numCols);
+}
+HostNVMatrix::HostNVMatrix(const Matrix& like, bool copy) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+    if (copy) {
+        copyFromHost(like);
+    }
+}
+HostNVMatrix::HostNVMatrix(const NVMatrix& like, bool copy) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+    if (copy) {
+        like.copy(*this);
+    }
+}
+HostNVMatrix::HostNVMatrix(const NVMatrix& like) {
+    _init(like.isTrans());
+    resize(like.getNumRows(), like.getNumCols());
+}
+HostNVMatrix::HostNVMatrix(const Matrix& like) {
+    _init(false);
+    resize(like.getNumRows(), like.getNumCols());
+}
+HostNVMatrix::HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans)
+    : NVMatrix(mem, numRows, numCols, stride, isTrans) {
+}
+
+NVMatrix& HostNVMatrix::construct() const {
+    return *new HostNVMatrix();
+}
+NVMatrix& HostNVMatrix::construct(bool isTrans) const {
+    return *new HostNVMatrix(isTrans);
+}
+NVMatrix& HostNVMatrix::construct(int numRows, int numCols, bool isTrans) const {
+    return *new HostNVMatrix(numRows, numCols, isTrans);
+}
+NVMatrix& HostNVMatrix::construct(const Matrix& like, bool copy) const {
+    return *new HostNVMatrix(like, copy);
+}
+NVMatrix& HostNVMatrix::construct(const NVMatrix& like, bool copy) const {
+    return *new HostNVMatrix(like, copy);
+}
+NVMatrix& HostNVMatrix::construct(const NVMatrix& like) const {
+    return *new HostNVMatrix(like);
+}
+NVMatrix& HostNVMatrix::construct(const Matrix& like) const {
+    return *new HostNVMatrix(like);
+}
+NVMatrix& HostNVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
+    return *new HostNVMatrix(mem, numRows, numCols, stride, isTrans);
+}
+
+void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) {
+    if (resizeTarget) {
+        resize(hostMatrix);
+    } else {
+        assert(isSameDims(hostMatrix));
+    }
+    setTrans(hostMatrix.isTrans());
+    if (getNumElements() > 0) {
+        checkCudaErrors(cudaMemcpy2D(getDevData(), _stride * sizeof(float), hostMatrix.getData(),
+                                     hostMatrix.getLeadingDim() * sizeof(float), getLeadingDim() * sizeof(float),
+                                     getFollowingDim(), cudaMemcpyHostToHost));
+//        syncStream(stream);
+    }
+}
+
+void
HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { + copyFromHost(hostMatrix, resizeTarget, 0); +} + +void HostNVMatrix::copyFromHost(const Matrix& hostMatrix) { + copyFromHost(hostMatrix, false, 0); +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { + if (resizeTarget) { + hostMatrix.resize(getNumRows(), getNumCols()); + } else { + assert(isSameDims(hostMatrix)); + } + hostMatrix.setTrans(_isTrans); + if (getNumElements() > 0) { + checkCudaErrors(cudaMemcpy2D(hostMatrix.getData(), hostMatrix.getLeadingDim() * sizeof(float), + getDevData(), _stride * sizeof(float), getLeadingDim() * sizeof(float), + getFollowingDim(), cudaMemcpyHostToHost)); +// syncStream(stream); + } +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { + copyToHost(hostMatrix, resizeTarget, 0); +} + +void HostNVMatrix::copyToHost(Matrix& hostMatrix) const { + copyToHost(hostMatrix, false, 0); +} + +void HostNVMatrix::alloc(int numElements) { +// checkCudaErrors(cudaHostAlloc(&_devData, numElements * sizeof(float), cudaHostAllocPortable)); + _memSegment = HOST_MEMORY_MANAGER::getInstance().malloc(numElements * sizeof(float)); +// _memSegment = FastHostMemoryManager::getInstance().malloc(numElements * sizeof(float)); +} + +void HostNVMatrix::dealloc() { +// FastHostMemoryManager::getInstance().free(_memSegment); + HOST_MEMORY_MANAGER::getInstance().free(_memSegment); + _memSegment = NULL; +// checkCudaErrors(cudaFreeHost(_devData)); +} + +cudaTextureObject_t HostNVMatrix::getTextureObject() { + assert(false); + return 0; +} diff --git a/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu new file mode 100644 index 0000000..628a1f5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu @@ -0,0 +1,77 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "../include/nvmatrix_kernels.cuh" + +__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int numThreads = blockDim.x * gridDim.x; + // const unsigned int numEls = tgtWidth * tgtHeight; + for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) { + const uint y = i / tgtWidth; + const uint x = i % tgtWidth; + const uint srcY = y % srcHeight; + const uint srcX = x % srcWidth; + tgt[i] = src[srcY * srcWidth + srcX]; + } +} + +__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements) { + __shared__ float shmem[DP_BLOCKSIZE]; + + uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x; + shmem[threadIdx.x] = 0; + if (eidx < gridDim.x * DP_BLOCKSIZE) { + for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) { + shmem[threadIdx.x] += a[eidx] * b[eidx]; + } + } + __syncthreads(); + if (threadIdx.x < 256) { + shmem[threadIdx.x] += shmem[threadIdx.x + 256]; + } + __syncthreads(); + if (threadIdx.x < 128) { + shmem[threadIdx.x] += shmem[threadIdx.x + 128]; + } + __syncthreads(); + if (threadIdx.x < 64) { + shmem[threadIdx.x] += shmem[threadIdx.x + 64]; + } + __syncthreads(); + if (threadIdx.x < 32) { + volatile float* mysh = &shmem[threadIdx.x]; + *mysh += mysh[32]; + *mysh += mysh[16]; + *mysh += mysh[8]; + *mysh += mysh[4]; + *mysh += mysh[2]; + *mysh += mysh[1]; + if (threadIdx.x == 0) { + target[blockIdx.x] = *mysh; + } + } +} + +__global__ void kSetupCurand(curandState *state, unsigned long long seed) { + const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; + /* Each thread gets same seed, a different sequence number, + no offset */ + curand_init(seed, tidx, 0, &state[tidx]); +} + diff --git a/caffe2/contrib/cuda-convnet2/python_util/__init__.py b/caffe2/contrib/cuda-convnet2/python_util/__init__.py new file mode 100644 index 0000000..520b1ea --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/caffe2/contrib/cuda-convnet2/python_util/data.py b/caffe2/contrib/cuda-convnet2/python_util/data.py new file mode 100644 index 0000000..d8c8ff1 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/data.py @@ -0,0 +1,194 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as n +from numpy.random import randn, rand, random_integers +import os +from threading import Thread +from util import * + +BATCH_META_FILE = "batches.meta" + +class DataLoaderThread(Thread): + def __init__(self, path, tgt): + Thread.__init__(self) + self.path = path + self.tgt = tgt + def run(self): + self.tgt += [unpickle(self.path)] + +class DataProvider: + BATCH_REGEX = re.compile('^data_batch_(\d+)(\.\d+)?$') + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): + if batch_range == None: + batch_range = DataProvider.get_batch_nums(data_dir) + if init_batchnum is None or init_batchnum not in batch_range: + init_batchnum = batch_range[0] + + self.data_dir = data_dir + self.batch_range = batch_range + self.curr_epoch = init_epoch + self.curr_batchnum = init_batchnum + self.dp_params = dp_params + self.batch_meta = self.get_batch_meta(data_dir) + self.data_dic = None + self.test = test + self.batch_idx = batch_range.index(init_batchnum) + + def get_next_batch(self): + if self.data_dic is None or len(self.batch_range) > 1: + self.data_dic = self.get_batch(self.curr_batchnum) + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + + return epoch, batchnum, self.data_dic + + def get_batch(self, batch_num): + fname = self.get_data_file_name(batch_num) + if os.path.isdir(fname): # batch in sub-batches + sub_batches = sorted(os.listdir(fname), key=alphanum_key) + #print sub_batches + num_sub_batches = len(sub_batches) + tgts = [[] for i in xrange(num_sub_batches)] + threads = [DataLoaderThread(os.path.join(fname, s), tgt) for (s, tgt) in zip(sub_batches, tgts)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return [t[0] for t in tgts] + return unpickle(self.get_data_file_name(batch_num)) + + def get_data_dims(self,idx=0): + return self.batch_meta['num_vis'] if idx == 0 else 1 + + def advance_batch(self): + self.batch_idx = self.get_next_batch_idx() + self.curr_batchnum = self.batch_range[self.batch_idx] + if self.batch_idx == 0: # we wrapped + self.curr_epoch += 1 + + def get_next_batch_idx(self): + return (self.batch_idx + 1) % len(self.batch_range) + + def get_next_batch_num(self): + return self.batch_range[self.get_next_batch_idx()] + + # get filename of current batch + def get_data_file_name(self, batchnum=None): + if batchnum is None: + batchnum = self.curr_batchnum + return os.path.join(self.data_dir, 'data_batch_%d' % batchnum) + + @classmethod + def get_instance(cls, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, type="default", dp_params={}, test=False): + # why the fuck can't i reference DataProvider in the original definition? 
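+        # Resolve the provider class from the type string: "dummy-..." types are mapped to
+        # the registered dummy provider class (the trailing number is parsed off and passed
+        # as the data dimensionality), while any other type is looked up directly in dp_classes.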
+ #cls.dp_classes['default'] = DataProvider + type = type or DataProvider.get_batch_meta(data_dir)['dp_type'] # allow data to decide data provider + if type.startswith("dummy-"): + name = "-".join(type.split('-')[:-1]) + "-n" + if name not in dp_types: + raise DataProviderException("No such data provider: %s" % type) + _class = dp_classes[name] + dims = int(type.split('-')[-1]) + return _class(dims) + elif type in dp_types: + _class = dp_classes[type] + return _class(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + + raise DataProviderException("No such data provider: %s" % type) + + @classmethod + def register_data_provider(cls, name, desc, _class): + if name in dp_types: + raise DataProviderException("Data provider %s already registered" % name) + dp_types[name] = desc + dp_classes[name] = _class + + @staticmethod + def get_batch_meta(data_dir): + return unpickle(os.path.join(data_dir, BATCH_META_FILE)) + + @staticmethod + def get_batch_filenames(srcdir): + return sorted([f for f in os.listdir(srcdir) if DataProvider.BATCH_REGEX.match(f)], key=alphanum_key) + + @staticmethod + def get_batch_nums(srcdir): + names = DataProvider.get_batch_filenames(srcdir) + return sorted(list(set(int(DataProvider.BATCH_REGEX.match(n).group(1)) for n in names))) + + @staticmethod + def get_num_batches(srcdir): + return len(DataProvider.get_batch_nums(srcdir)) + +class DummyDataProvider(DataProvider): + def __init__(self, data_dim): + #self.data_dim = data_dim + self.batch_range = [1] + self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True} + self.curr_epoch = 1 + self.curr_batchnum = 1 + self.batch_idx = 0 + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + data = rand(512, self.get_data_dims()).astype(n.single) + return self.curr_epoch, self.curr_batchnum, {'data':data} + +class LabeledDataProvider(DataProvider): + def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): + DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) + + def get_num_classes(self): + return len(self.batch_meta['label_names']) + +class LabeledDummyDataProvider(DummyDataProvider): + def __init__(self, data_dim, num_classes=10, num_cases=7): + #self.data_dim = data_dim + self.batch_range = [1] + self.batch_meta = {'num_vis': data_dim, + 'label_names': [str(x) for x in range(num_classes)], + 'data_in_rows':True} + self.num_cases = num_cases + self.num_classes = num_classes + self.curr_epoch = 1 + self.curr_batchnum = 1 + self.batch_idx=0 + self.data = None + + def get_num_classes(self): + return self.num_classes + + def get_next_batch(self): + epoch, batchnum = self.curr_epoch, self.curr_batchnum + self.advance_batch() + if self.data is None: + data = rand(self.num_cases, self.get_data_dims()).astype(n.single) # <--changed to rand + labels = n.require(n.c_[random_integers(0,self.num_classes-1,self.num_cases)], requirements='C', dtype=n.single) + self.data, self.labels = data, labels + else: + data, labels = self.data, self.labels +# print data.shape, labels.shape + return self.curr_epoch, self.curr_batchnum, [data.T, labels.T ] + + +dp_types = {"dummy-n": "Dummy data provider for n-dimensional data", + "dummy-labeled-n": "Labeled dummy data provider for n-dimensional data"} +dp_classes = {"dummy-n": DummyDataProvider, + "dummy-labeled-n": LabeledDummyDataProvider} + +class DataProviderException(Exception): + pass diff --git 
a/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py b/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py new file mode 100644 index 0000000..d4df71c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/gpumodel.py @@ -0,0 +1,358 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as n +import os +from time import time, asctime, localtime, strftime +from util import * +from data import * +from options import * +from math import ceil, floor, sqrt +from data import DataProvider, dp_types +import sys +import shutil +import platform +from os import linesep as NL +from threading import Thread +import tempfile as tf + +class ModelStateException(Exception): + pass + +class CheckpointWriter(Thread): + def __init__(self, path, dic): + Thread.__init__(self) + self.path = path + self.dic = dic + + def run(self): + save_dir = os.path.dirname(self.path) + save_file = os.path.basename(self.path) + # Write checkpoint to temporary filename + tmpfile = tf.NamedTemporaryFile(dir=os.path.dirname(save_dir), delete=False) + pickle(tmpfile, self.dic) # Also closes tf + # Move it to final filename + os.rename(tmpfile.name, self.path) + # Delete old checkpoints + for f in os.listdir(save_dir): + if f != save_file: + os.remove(os.path.join(save_dir, f)) + +# GPU Model interface +class IGPUModel: + def __init__(self, model_name, op, load_dic, filename_options=[], dp_params={}): + # these are input parameters + self.model_name = model_name + self.op = op + self.options = op.options + self.load_dic = load_dic + self.filename_options = filename_options + self.dp_params = dp_params + self.device_ids = self.op.get_value('gpu') + self.fill_excused_options() + self.checkpoint_writer = None + #assert self.op.all_values_given() + + for o in op.get_options_list(): + setattr(self, o.name, o.value) + self.loaded_from_checkpoint = load_dic is not None + # these are things that the model must remember but they're not input parameters + if self.loaded_from_checkpoint: + self.model_state = load_dic["model_state"] + self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else self.options['load_file'].value + if not os.path.isdir(self.save_file) and os.path.exists(self.save_file): + self.save_file = os.path.dirname(self.save_file) +# print self.options["save_file_override"].value, self.save_file + else: + self.model_state = {} + self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else os.path.join(self.options['save_path'].value, model_name + "_" + '_'.join(['%s_%s' % (char, self.options[opt].get_str_value()) for opt, char in filename_options]) + '_' + strftime('%Y-%m-%d_%H.%M.%S')) + self.model_state["train_outputs"] = [] + self.model_state["test_outputs"] = [] + self.model_state["epoch"] = 1 + self.model_state["batchnum"] = self.train_batch_range[0] +# print self.save_file + + self.init_data_providers() + if load_dic: + 
self.train_data_provider.advance_batch() + + # model state often requries knowledge of data provider, so it's initialized after + try: + self.init_model_state() + except ModelStateException, e: + print e + sys.exit(1) + for var, val in self.model_state.iteritems(): + setattr(self, var, val) + + self.import_model() + self.init_model_lib() + + def import_model(self): + print "=========================" + print "Importing %s C++ module" % ('_' + self.model_name) + self.libmodel = __import__('_' + self.model_name) + + def fill_excused_options(self): + pass + + def init_data_providers(self): + self.dp_params['convnet'] = self + try: + self.test_data_provider = DataProvider.get_instance(self.data_path, self.test_batch_range, + type=self.dp_type, dp_params=self.dp_params, test=True) + self.train_data_provider = DataProvider.get_instance(self.data_path, self.train_batch_range, + self.model_state["epoch"], self.model_state["batchnum"], + type=self.dp_type, dp_params=self.dp_params, test=False) + except DataProviderException, e: + print "Unable to create data provider: %s" % e + self.print_data_providers() + sys.exit() + + def init_model_state(self): + pass + + def init_model_lib(self): + pass + + def start(self): + if self.test_only: + self.test_outputs += [self.get_test_error()] + self.print_test_results() + else: + self.train() + self.cleanup() + if self.force_save: + self.save_state().join() + sys.exit(0) + + def train(self): + print "=========================" + print "Training %s" % self.model_name + self.op.print_values() + print "=========================" + self.print_model_state() + print "Running on CUDA device(s) %s" % ", ".join("%d" % d for d in self.device_ids) + print "Current time: %s" % asctime(localtime()) + print "Saving checkpoints to %s" % self.save_file + print "=========================" + next_data = self.get_next_batch() + while self.epoch <= self.num_epochs: + data = next_data + self.epoch, self.batchnum = data[0], data[1] + self.print_iteration() + sys.stdout.flush() + + compute_time_py = time() + self.start_batch(data) + + # load the next batch while the current one is computing + next_data = self.get_next_batch() + + batch_output = self.finish_batch() + self.train_outputs += [batch_output] + self.print_train_results() + + if self.get_num_batches_done() % self.testing_freq == 0: + self.sync_with_host() + self.test_outputs += [self.get_test_error()] + self.print_test_results() + self.print_test_status() + self.conditional_save() + + self.print_elapsed_time(time() - compute_time_py) + + def cleanup(self): + if self.checkpoint_writer is not None: + self.checkpoint_writer.join() + self.checkpoint_writer = None + + def print_model_state(self): + pass + + def get_num_batches_done(self): + return len(self.train_batch_range) * (self.epoch - 1) + self.batchnum - self.train_batch_range[0] + 1 + + def get_next_batch(self, train=True): + dp = self.train_data_provider + if not train: + dp = self.test_data_provider + return self.parse_batch_data(dp.get_next_batch(), train=train) + + def parse_batch_data(self, batch_data, train=True): + return batch_data[0], batch_data[1], batch_data[2]['data'] + + def start_batch(self, batch_data, train=True): + self.libmodel.startBatch(batch_data[2], not train) + + def finish_batch(self): + return self.libmodel.finishBatch() + + def print_iteration(self): + print "\t%d.%d..." 
% (self.epoch, self.batchnum), + + def print_elapsed_time(self, compute_time_py): + print "(%.3f sec)" % (compute_time_py) + + def print_train_results(self): + batch_error = self.train_outputs[-1][0] + if not (batch_error > 0 and batch_error < 2e20): + print "Crazy train error: %.6f" % batch_error + self.cleanup() + + print "Train error: %.6f " % (batch_error), + + def print_test_results(self): + batch_error = self.test_outputs[-1][0] + print "%s\t\tTest error: %.6f" % (NL, batch_error), + + def print_test_status(self): + status = (len(self.test_outputs) == 1 or self.test_outputs[-1][0] < self.test_outputs[-2][0]) and "ok" or "WORSE" + print status, + + def sync_with_host(self): + if self.checkpoint_writer is not None: + self.checkpoint_writer.join() + self.checkpoint_writer = None + self.libmodel.syncWithHost() + + def conditional_save(self): + batch_error = self.test_outputs[-1][0] + if batch_error > 0 and batch_error < self.max_test_err: + self.save_state() + else: + print "\tTest error > %g, not saving." % self.max_test_err, + + def aggregate_test_outputs(self, test_outputs): + test_error = tuple([sum(t[r] for t in test_outputs) / (1 if self.test_one else len(self.test_batch_range)) for r in range(len(test_outputs[-1]))]) + return test_error + + def get_test_error(self): + next_data = self.get_next_batch(train=False) + test_outputs = [] + while True: + data = next_data + start_time_test = time() + self.start_batch(data, train=False) + load_next = (not self.test_one or self.test_only) and data[1] < self.test_batch_range[-1] + if load_next: # load next batch + next_data = self.get_next_batch(train=False) + test_outputs += [self.finish_batch()] + if self.test_only: # Print the individual batch results for safety + print "batch %d: %s" % (data[1], str(test_outputs[-1])), + self.print_elapsed_time(time() - start_time_test) + if not load_next: + break + sys.stdout.flush() + + return self.aggregate_test_outputs(test_outputs) + + def set_var(self, var_name, var_val): + setattr(self, var_name, var_val) + self.model_state[var_name] = var_val + return var_val + + def get_var(self, var_name): + return self.model_state[var_name] + + def has_var(self, var_name): + return var_name in self.model_state + + def save_state(self): + for att in self.model_state: + if hasattr(self, att): + self.model_state[att] = getattr(self, att) + + dic = {"model_state": self.model_state, + "op": self.op} + + checkpoint_file = "%d.%d" % (self.epoch, self.batchnum) + checkpoint_file_full_path = os.path.join(self.save_file, checkpoint_file) + if not os.path.exists(self.save_file): + os.makedirs(self.save_file) + + assert self.checkpoint_writer is None + self.checkpoint_writer = CheckpointWriter(checkpoint_file_full_path, dic) + self.checkpoint_writer.start() + print "-------------------------------------------------------" + print "Saved checkpoint to %s" % self.save_file + print "=======================================================", + return self.checkpoint_writer + + def get_progress(self): + num_batches_total = self.num_epochs * len(self.train_batch_range) + return min(1.0, max(0.0, float(self.get_num_batches_done()-1) / num_batches_total)) + + @staticmethod + def load_checkpoint(load_dir): + if os.path.isdir(load_dir): + return unpickle(os.path.join(load_dir, sorted(os.listdir(load_dir), key=alphanum_key)[-1])) + return unpickle(load_dir) + + @staticmethod + def get_options_parser(): + op = OptionsParser() + op.add_option("load-file", "load_file", StringOptionParser, "Load file", default="", 
excuses=OptionsParser.EXCUSE_ALL) + op.add_option("save-path", "save_path", StringOptionParser, "Save path", excuses=['save_file_override']) + op.add_option("save-file", "save_file_override", StringOptionParser, "Save file override", excuses=['save_path']) + op.add_option("train-range", "train_batch_range", RangeOptionParser, "Data batch range: training") + op.add_option("test-range", "test_batch_range", RangeOptionParser, "Data batch range: testing") + op.add_option("data-provider", "dp_type", StringOptionParser, "Data provider", default="default") + op.add_option("test-freq", "testing_freq", IntegerOptionParser, "Testing frequency", default=25) + op.add_option("epochs", "num_epochs", IntegerOptionParser, "Number of epochs", default=500) + op.add_option("data-path", "data_path", StringOptionParser, "Data path") + + op.add_option("max-test-err", "max_test_err", FloatOptionParser, "Maximum test error for saving") + op.add_option("test-only", "test_only", BooleanOptionParser, "Test and quit?", default=0) + op.add_option("test-one", "test_one", BooleanOptionParser, "Test on one batch at a time?", default=1) + op.add_option("force-save", "force_save", BooleanOptionParser, "Force save before quitting", default=0) + op.add_option("gpu", "gpu", ListOptionParser(IntegerOptionParser), "GPU override") + return op + + @staticmethod + def print_data_providers(): + print "Available data providers:" + for dp, desc in dp_types.iteritems(): + print " %s: %s" % (dp, desc) + + + @staticmethod + def parse_options(op): + try: + load_dic = None + options = op.parse() + load_location = None +# print options['load_file'].value_given, options['save_file_override'].value_given +# print options['save_file_override'].value + if options['load_file'].value_given: + load_location = options['load_file'].value + elif options['save_file_override'].value_given and os.path.exists(options['save_file_override'].value): + load_location = options['save_file_override'].value + + if load_location is not None: + load_dic = IGPUModel.load_checkpoint(load_location) + old_op = load_dic["op"] + old_op.merge_from(op) + op = old_op + op.eval_expr_defaults() + return op, load_dic + except OptionMissingException, e: + print e + op.print_usage() + except OptionException, e: + print e + except UnpickleError, e: + print "Error loading checkpoint:" + print e + sys.exit() diff --git a/caffe2/contrib/cuda-convnet2/python_util/options.py b/caffe2/contrib/cuda-convnet2/python_util/options.py new file mode 100644 index 0000000..afc6ed5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/options.py @@ -0,0 +1,408 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
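+
+# Declarative command-line option framework used by the cuda-convnet2 front-end scripts:
+# an OptionsParser holds typed Option objects (integer, float, boolean, string, range and
+# list parsers defined below), options may "excuse" or require one another, and defaults
+# can be given as OptionExpression strings evaluated against the other option values.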
+ +import sys +from getopt import getopt +import os +import re +#import types + +TERM_BOLD_START = "\033[1m" +TERM_BOLD_END = "\033[0m" + +class Option: + def __init__(self, letter, name, desc, parser, set_once, default, excuses, requires, save): + assert not name is None + self.letter = letter + self.name = name + self.desc = desc + self.parser = parser + self.set_once = set_once + self.default = default + self.excuses = excuses + self.requires = requires + self.save = save + + self.value = None + self.value_given = False + self.prefixed_letter = min(2, len(letter)) * '-' + letter + + def set_value(self, value, parse=True): + try: + self.value = self.parser.parse(value) if parse else value + self.value_given = True +# print self.name, self.value + except OptionException, e: + raise OptionException("Unable to parse option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) + + def set_default(self): + if not self.default is None: + self.value = self.default + + def eval_expr_default(self, env): + try: + if isinstance(self.default, OptionExpression) and not self.value_given: + self.value = self.default.evaluate(env) + if not self.parser.is_type(self.value): + raise OptionException("expression result %s is not of right type (%s)" % (self.value, self.parser.get_type_str())) + except Exception, e: + raise OptionException("Unable to set default value for option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) + + def get_str_value(self, get_default_str=False): + val = self.value + if get_default_str: val = self.default + if val is None: return "" + if isinstance(val, OptionExpression): + return val.expr + return self.parser.to_string(val) + +class OptionsParser: + """An option parsing class. All options without default values are mandatory, unless a excuses + option (usually a load file) is given. + Does not support options without arguments.""" + SORT_LETTER = 1 + SORT_DESC = 2 + SORT_EXPR_LAST = 3 + EXCUSE_ALL = "all" + def __init__(self): + self.options = {} + + def add_option(self, letter, name, parser, desc, set_once=False, default=None, excuses=[], requires=[], save=True): + """ + The letter parameter is the actual parameter that the user will have to supply on the command line. + The name parameter is some name to be given to this option and must be a valid python variable name. + + An explanation of the "default" parameter: + The default value, if specified, should have the same type as the option. + You can also specify an expression as the default value. In this case, the default value of the parameter + will be the output of the expression. The expression may assume all other option names + as local variables. For example, you can define the hidden bias + learning rate to be 10 times the weight learning rate by setting this default: + + default=OptionExpression("eps_w * 10") (assuming an option named eps_w exists). + + However, it is up to you to make sure you do not make any circular expression definitions. + + Note that the order in which the options are parsed is arbitrary. + In particular, expression default values that depend on other expression default values + will often raise errors (depending on the order in which they happen to be parsed). + Therefore it is best not to make the default value of one variable depend on the value + of another if the other variable's default value is itself an expression. + + An explanation of the "excuses" parameter: + All options are mandatory, but certain options can exclude other options from being mandatory. 
+ For example, if the excuses parameter for option "load_file" is ["num_hid", "num_vis"], + then the options num_hid and num_vis are not mandatory as long as load_file is specified. + Use the special flag EXCUSE_ALL to allow an option to make all other options optional. + """ + + assert name not in self.options + self.options[name] = Option(letter, name, desc, parser, set_once, default, excuses, requires, save) + + def set_value(self, name, value, parse=True): + self.options[name].set_value(value, parse=parse) + + def get_value(self, name): + return self.options[name].value + + def delete_option(self, name): + if name in self.options: + del self.options[name] + + def parse(self, eval_expr_defaults=False): + """Parses the options in sys.argv based on the options added to this parser. The + default behavior is to leave any expression default options as OptionExpression objects. + Set eval_expr_defaults=True to circumvent this.""" + short_opt_str = ''.join(["%s:" % self.options[name].letter for name in self.options if len(self.options[name].letter) == 1]) + long_opts = ["%s=" % self.options[name].letter for name in self.options if len(self.options[name].letter) > 1] + (go, ga) = getopt(sys.argv[1:], short_opt_str, longopts=long_opts) + dic = dict(go) + + for o in self.get_options_list(sort_order=self.SORT_EXPR_LAST): + if o.prefixed_letter in dic: + o.set_value(dic[o.prefixed_letter]) + else: + # check if excused or has default + excused = max([o2.prefixed_letter in dic for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses]) + if not excused and o.default is None: + raise OptionMissingException("Option %s (%s) not supplied" % (o.prefixed_letter, o.desc)) + o.set_default() + # check requirements + if o.prefixed_letter in dic: + for o2 in self.get_options_list(sort_order=self.SORT_LETTER): + if o2.name in o.requires and o2.prefixed_letter not in dic: + raise OptionMissingException("Option %s (%s) requires option %s (%s)" % (o.prefixed_letter, o.desc, + o2.prefixed_letter, o2.desc)) + if eval_expr_defaults: + self.eval_expr_defaults() + return self.options + + def merge_from(self, op2): + """Merges the options in op2 into this instance, but does not overwrite + this instances's SET options with op2's default values.""" + for name, o in self.options.iteritems(): + if name in op2.options and ((op2.options[name].value_given and op2.options[name].value != self.options[name].value) or not op2.options[name].save): + if op2.options[name].set_once: + raise OptionException("Option %s (%s) cannot be changed" % (op2.options[name].prefixed_letter, op2.options[name].desc)) + self.options[name] = op2.options[name] + for name in op2.options: + if name not in self.options: + self.options[name] = op2.options[name] + + def eval_expr_defaults(self): + env = dict([(name, o.value) for name, o in self.options.iteritems()]) + for o in self.options.values(): + o.eval_expr_default(env) + + def all_values_given(self): + return max([o.value_given for o in self.options.values() if o.default is not None]) + + def get_options_list(self, sort_order=SORT_LETTER): + """ Returns the list of Option objects in this OptionParser, + sorted as specified""" + + cmp = lambda x, y: (x.desc < y.desc and -1 or 1) + if sort_order == self.SORT_LETTER: + cmp = lambda x, y: (x.letter < y.letter and -1 or 1) + elif sort_order == self.SORT_EXPR_LAST: + cmp = lambda x, y: (type(x.default) == OptionExpression and 1 or -1) + return sorted(self.options.values(), cmp=cmp) + + def print_usage(self, 
print_constraints=False): + print "%s usage:" % os.path.basename(sys.argv[0]) + opslist = self.get_options_list() + + usage_strings = [] + num_def = 0 + for o in opslist: + excs = ' ' + if o.default is None: + excs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses])) + reqs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.name in o.requires])) + usg = (OptionsParser._bold(o.prefixed_letter) + " <%s>" % o.parser.get_type_str(), o.desc, ("[%s]" % o.get_str_value(get_default_str=True)) if not o.default is None else None, excs, reqs) + if o.default is None: + usage_strings += [usg] + else: + usage_strings.insert(num_def, usg) + num_def += 1 + + col_widths = [self._longest_value(usage_strings, key=lambda x:x[i]) for i in range(len(usage_strings[0]) - 1)] + + col_names = [" Option", "Description", "Default"] + if print_constraints: + col_names += ["Excused by", "Requires"] + for i, s in enumerate(col_names): + print self._bold(s.ljust(col_widths[i])), + + print "" + for l, d, de, ex, req in usage_strings: + if de is None: + de = ' ' + print (" %s -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), + else: + print (" [%s] -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), + if print_constraints: + print ex.ljust(col_widths[3]), req + else: + print "" + + def print_values(self): + longest_desc = self._longest_value(self.options.values(), key=lambda x:x.desc) + longest_def_value = self._longest_value([v for v in self.options.values() if not v.value_given and not v.default is None], + key=lambda x:x.get_str_value()) + for o in self.get_options_list(sort_order=self.SORT_DESC): + print "%s: %s %s" % (o.desc.ljust(longest_desc), o.get_str_value().ljust(longest_def_value), (not o.value_given and not o.default is None) and "[DEFAULT]" or "") + + @staticmethod + def _longest_value(values, key=lambda x:x): + mylen = lambda x: 0 if x is None else len(x) + return mylen(key(max(values, key=lambda x:mylen(key(x))))) + + @staticmethod + def _bold(str): + return TERM_BOLD_START + str + TERM_BOLD_END + +class OptionException(Exception): + pass + +class OptionMissingException(OptionException): + pass + +class OptionParser: + @staticmethod + def parse(value): + return str(value) + + @staticmethod + def to_string(value): + return str(value) + + @staticmethod + def get_type_str(): + pass + +class IntegerOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + return int(value) + except: + raise OptionException("argument is not an integer") + + @staticmethod + def get_type_str(): + return "int" + + @staticmethod + def is_type(value): + return type(value) == int + +class BooleanOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + v = int(value) + if not v in (0,1): + raise OptionException + return v + except: + raise OptionException("argument is not a boolean") + + @staticmethod + def get_type_str(): + return "0/1" + + @staticmethod + def is_type(value): + return type(value) == int and value in (0, 1) + +class StringOptionParser(OptionParser): + @staticmethod + def get_type_str(): + return "string" + + @staticmethod + def is_type(value): + return type(value) == str + +class FloatOptionParser(OptionParser): + @staticmethod + def parse(value): + try: + return float(value) + except: + raise OptionException("argument is not a float") + + @staticmethod + def to_string(value): + return "%.6g" % value + + @staticmethod + def 
get_type_str(): + return "float" + + @staticmethod + def is_type(value): + return type(value) == float + +class RangeOptionParser(OptionParser): + @staticmethod + def parse(value): + m = re.match("^(\d+)\-(\d+)$", value) + try: + if m: return range(int(m.group(1)), int(m.group(2)) + 1) + return [int(value)] + except: + raise OptionException("argument is neither an integer nor a range") + + @staticmethod + def to_string(value): + return "%d-%d" % (value[0], value[-1]) + + @staticmethod + def get_type_str(): + return "int[-int]" + + @staticmethod + def is_type(value): + return type(value) == list + +class ListOptionParser(OptionParser): + """ + A parser that parses a delimited list of items. If the "parsers" + argument is a list of parsers, then the list of items must have the form and length + specified by that list. + + Example: + ListOptionParser([FloatOptionParser, IntegerOptionParser]) + + would parse "0.5,3" but not "0.5,3,0.6" or "0.5" or "3,0.5". + + If the "parsers" argument is another parser, then the list of items may be of + arbitrary length, but each item must be parseable by the given parser. + + Example: + ListOptionParser(FloatOptionParser) + + would parse "0.5" and "0.5,0.3" and "0.5,0.3,0.6", etc. + """ + def __init__(self, parsers, sepchar=','): + self.parsers = parsers + self.sepchar = sepchar + + def parse(self, value): + values = value.split(self.sepchar) + if type(self.parsers) == list and len(values) != len(self.parsers): + raise OptionException("requires %d arguments, given %d" % (len(self.parsers), len(values))) + + try: + if type(self.parsers) == list: + return [p.parse(v) for p, v in zip(self.parsers, values)] + return [self.parsers.parse(v) for v in values] + except: + raise OptionException("argument is not of the form %s" % self.get_type_str()) + + def to_string(self, value): + if type(self.parsers) == list: + return self.sepchar.join([p.to_string(v) for p, v in zip(self.parsers, value)]) + return self.sepchar.join([self.parsers.to_string(v) for v in value]) + + def get_type_str(self): + if type(self.parsers) == list: + return self.sepchar.join([p.get_type_str() for p in self.parsers]) + return "%s%s..." % (self.parsers.get_type_str(), self.sepchar) + + @staticmethod + def is_type(value): + return type(value) == list + +class OptionExpression: + """ + This allows you to specify option values in terms of other option values. + Example: + op.add_option("eps-w", "eps_w", ListOptionParser(FloatOptionParser), "Weight learning rates for each layer") + op.add_option("eps-b", "eps_b", ListOptionParser(FloatOptionParser), "Bias learning rates for each layer", default=OptionExpression("[o * 10 for o in eps_w]")) + + This says: the default bias learning rate for each layer is 10 + times the weight learning rate for that layer. + """ + def __init__(self, expr): + self.expr = expr + + def evaluate(self, options): + locals().update(options) + try: + return eval(self.expr) + except Exception, e: + raise OptionException("expression '%s': unable to parse: %s" % (self.expr, e)) diff --git a/caffe2/contrib/cuda-convnet2/python_util/util.py b/caffe2/contrib/cuda-convnet2/python_util/util.py new file mode 100644 index 0000000..b3b6211 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/python_util/util.py @@ -0,0 +1,94 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import cPickle +import os +from cStringIO import StringIO + +class UnpickleError(Exception): + pass + +GPU_LOCK_NO_SCRIPT = -2 +GPU_LOCK_NO_LOCK = -1 + +def pickle(filename, data): + fo = filename + if type(filename) == str: + fo = open(filename, "w") + + cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) + fo.close() + +def unpickle(filename): + if not os.path.exists(filename): + raise UnpickleError("Path '%s' does not exist." % filename) + + fo = open(filename, 'r') + z = StringIO() + file_size = os.fstat(fo.fileno()).st_size + # Read 1GB at a time to avoid overflow + while fo.tell() < file_size: + z.write(fo.read(1 << 30)) + fo.close() + dict = cPickle.loads(z.getvalue()) + z.close() + + return dict + +def is_intel_machine(): + VENDOR_ID_REGEX = re.compile('^vendor_id\s+: (\S+)') + f = open('/proc/cpuinfo') + for line in f: + m = VENDOR_ID_REGEX.match(line) + if m: + f.close() + return m.group(1) == 'GenuineIntel' + f.close() + return False + +# Returns the CPUs associated with a given GPU +def get_cpus_for_gpu(gpu): + #proc = subprocess.Popen(['nvidia-smi', '-q', '-i', str(gpu)], stdout=subprocess.PIPE) + #lines = proc.communicate()[0] + #lines = subprocess.check_output(['nvidia-smi', '-q', '-i', str(gpu)]).split(os.linesep) + + with open('/proc/driver/nvidia/gpus/%d/information' % gpu) as f: + for line in f: + if line.startswith('Bus Location'): + bus_id = line.split(':', 1)[1].strip() + bus_id = bus_id[:7] + ':' + bus_id[8:] + ff = open('/sys/module/nvidia/drivers/pci:nvidia/%s/local_cpulist' % bus_id) + cpus_str = ff.readline() + ff.close() + cpus = [cpu for s in cpus_str.split(',') for cpu in range(int(s.split('-')[0]),int(s.split('-')[1])+1)] + return cpus + return [-1] + +def get_cpu(): + if is_intel_machine(): + return 'intel' + return 'amd' + +def is_windows_machine(): + return os.name == 'nt' + +def tryint(s): + try: + return int(s) + except: + return s + +def alphanum_key(s): + return [tryint(c) for c in re.split('([0-9]+)', s)] diff --git a/caffe2/contrib/cuda-convnet2/shownet.py b/caffe2/contrib/cuda-convnet2/shownet.py new file mode 100644 index 0000000..6e1bf11 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/shownet.py @@ -0,0 +1,341 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
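+
+# shownet.py is the visualization front-end for trained cuda-convnet2 models: it loads a
+# saved checkpoint through ConvNet/IGPUModel and can plot training/test cost curves,
+# the learned filters of a chosen layer, and sample predictions on a test batch.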
+ +import os +import sys +from tarfile import TarFile, TarInfo +from matplotlib import pylab as pl +import numpy as n +import getopt as opt +from python_util.util import * +from math import sqrt, ceil, floor +from python_util.gpumodel import IGPUModel +import random as r +import numpy.random as nr +from convnet import ConvNet +from python_util.options import * +from PIL import Image +from time import sleep + +class ShowNetError(Exception): + pass + +class ShowConvNet(ConvNet): + def __init__(self, op, load_dic): + ConvNet.__init__(self, op, load_dic) + + def init_data_providers(self): + self.need_gpu = self.op.get_value('show_preds') + class Dummy: + def advance_batch(self): + pass + if self.need_gpu: + ConvNet.init_data_providers(self) + else: + self.train_data_provider = self.test_data_provider = Dummy() + + def import_model(self): + if self.need_gpu: + ConvNet.import_model(self) + + def init_model_state(self): + if self.op.get_value('show_preds'): + self.softmax_name = self.op.get_value('show_preds') + + def init_model_lib(self): + if self.need_gpu: + ConvNet.init_model_lib(self) + + def plot_cost(self): + if self.show_cost not in self.train_outputs[0][0]: + raise ShowNetError("Cost function with name '%s' not defined by given convnet." % self.show_cost) +# print self.test_outputs + train_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.train_outputs] + test_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.test_outputs] + if self.smooth_test_errors: + test_errors = [sum(test_errors[max(0,i-len(self.test_batch_range)):i])/(i-max(0,i-len(self.test_batch_range))) for i in xrange(1,len(test_errors)+1)] + numbatches = len(self.train_batch_range) + test_errors = n.row_stack(test_errors) + test_errors = n.tile(test_errors, (1, self.testing_freq)) + test_errors = list(test_errors.flatten()) + test_errors += [test_errors[-1]] * max(0,len(train_errors) - len(test_errors)) + test_errors = test_errors[:len(train_errors)] + + numepochs = len(train_errors) / float(numbatches) + pl.figure(1) + x = range(0, len(train_errors)) + pl.plot(x, train_errors, 'k-', label='Training set') + pl.plot(x, test_errors, 'r-', label='Test set') + pl.legend() + ticklocs = range(numbatches, len(train_errors) - len(train_errors) % numbatches + 1, numbatches) + epoch_label_gran = int(ceil(numepochs / 20.)) + epoch_label_gran = int(ceil(float(epoch_label_gran) / 10) * 10) if numepochs >= 10 else epoch_label_gran + ticklabels = map(lambda x: str((x[1] / numbatches)) if x[0] % epoch_label_gran == epoch_label_gran-1 else '', enumerate(ticklocs)) + + pl.xticks(ticklocs, ticklabels) + pl.xlabel('Epoch') +# pl.ylabel(self.show_cost) + pl.title('%s[%d]' % (self.show_cost, self.cost_idx)) +# print "plotted cost" + + def make_filter_fig(self, filters, filter_start, fignum, _title, num_filters, combine_chans, FILTERS_PER_ROW=16): + MAX_ROWS = 24 + MAX_FILTERS = FILTERS_PER_ROW * MAX_ROWS + num_colors = filters.shape[0] + f_per_row = int(ceil(FILTERS_PER_ROW / float(1 if combine_chans else num_colors))) + filter_end = min(filter_start+MAX_FILTERS, num_filters) + filter_rows = int(ceil(float(filter_end - filter_start) / f_per_row)) + + filter_pixels = filters.shape[1] + filter_size = int(sqrt(filters.shape[1])) + fig = pl.figure(fignum) + fig.text(.5, .95, '%s %dx%d filters %d-%d' % (_title, filter_size, filter_size, filter_start, filter_end-1), horizontalalignment='center') + num_filters = filter_end 
- filter_start + if not combine_chans: + bigpic = n.zeros((filter_size * filter_rows + filter_rows + 1, filter_size*num_colors * f_per_row + f_per_row + 1), dtype=n.single) + else: + bigpic = n.zeros((3, filter_size * filter_rows + filter_rows + 1, filter_size * f_per_row + f_per_row + 1), dtype=n.single) + + for m in xrange(filter_start,filter_end ): + filter = filters[:,:,m] + y, x = (m - filter_start) / f_per_row, (m - filter_start) % f_per_row + if not combine_chans: + for c in xrange(num_colors): + filter_pic = filter[c,:].reshape((filter_size,filter_size)) + bigpic[1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, + 1 + (1 + filter_size*num_colors) * x + filter_size*c:1 + (1 + filter_size*num_colors) * x + filter_size*(c+1)] = filter_pic + else: + filter_pic = filter.reshape((3, filter_size,filter_size)) + bigpic[:, + 1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, + 1 + (1 + filter_size) * x:1 + (1 + filter_size) * x + filter_size] = filter_pic + + pl.xticks([]) + pl.yticks([]) + if not combine_chans: + pl.imshow(bigpic, cmap=pl.cm.gray, interpolation='nearest') + else: + bigpic = bigpic.swapaxes(0,2).swapaxes(0,1) + pl.imshow(bigpic, interpolation='nearest') + + def plot_filters(self): + FILTERS_PER_ROW = 16 + filter_start = 0 # First filter to show + if self.show_filters not in self.layers: + raise ShowNetError("Layer with name '%s' not defined by given convnet." % self.show_filters) + layer = self.layers[self.show_filters] + filters = layer['weights'][self.input_idx] +# filters = filters - filters.min() +# filters = filters / filters.max() + if layer['type'] == 'fc': # Fully-connected layer + num_filters = layer['outputs'] + channels = self.channels + filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) + elif layer['type'] in ('conv', 'local'): # Conv layer + num_filters = layer['filters'] + channels = layer['filterChannels'][self.input_idx] + if layer['type'] == 'local': + filters = filters.reshape((layer['modules'], channels, layer['filterPixels'][self.input_idx], num_filters)) + filters = filters[:, :, :, self.local_plane] # first map for now (modules, channels, pixels) + filters = filters.swapaxes(0,2).swapaxes(0,1) + num_filters = layer['modules'] +# filters = filters.swapaxes(0,1).reshape(channels * layer['filterPixels'][self.input_idx], num_filters * layer['modules']) +# num_filters *= layer['modules'] + FILTERS_PER_ROW = layer['modulesX'] + else: + filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) + + + # Convert YUV filters to RGB + if self.yuv_to_rgb and channels == 3: + R = filters[0,:,:] + 1.28033 * filters[2,:,:] + G = filters[0,:,:] + -0.21482 * filters[1,:,:] + -0.38059 * filters[2,:,:] + B = filters[0,:,:] + 2.12798 * filters[1,:,:] + filters[0,:,:], filters[1,:,:], filters[2,:,:] = R, G, B + combine_chans = not self.no_rgb and channels == 3 + + # Make sure you don't modify the backing array itself here -- so no -= or /= + if self.norm_filters: + #print filters.shape + filters = filters - n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).mean(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1)) + filters = filters / n.sqrt(n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).var(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1))) + #filters = filters - n.tile(filters.min(axis=0).min(axis=0), (3, filters.shape[1], 1)) + #filters = filters / 
n.tile(filters.max(axis=0).max(axis=0), (3, filters.shape[1], 1)) + #else: + filters = filters - filters.min() + filters = filters / filters.max() + + self.make_filter_fig(filters, filter_start, 2, 'Layer %s' % self.show_filters, num_filters, combine_chans, FILTERS_PER_ROW=FILTERS_PER_ROW) + + def plot_predictions(self): + epoch, batch, data = self.get_next_batch(train=False) # get a test batch + num_classes = self.test_data_provider.get_num_classes() + NUM_ROWS = 2 + NUM_COLS = 4 + NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1] + NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels + NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs'] + PRED_IDX = 1 + + label_names = [lab.split(',')[0] for lab in self.test_data_provider.batch_meta['label_names']] + if self.only_errors: + preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single) + else: + preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single) + #rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS] + rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS) + if NUM_IMGS < data[0].shape[1]: + data = [n.require(d[:,rand_idx], requirements='C') for d in data] +# data += [preds] + # Run the model + print [d.shape for d in data], preds.shape + self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name]) + IGPUModel.finish_batch(self) + print preds + data[0] = self.test_data_provider.get_plottable_data(data[0]) + + if self.save_preds: + if not gfile.Exists(self.save_preds): + gfile.MakeDirs(self.save_preds) + preds_thresh = preds > 0.5 # Binarize predictions + data[0] = data[0] * 255.0 + data[0][data[0]<0] = 0 + data[0][data[0]>255] = 255 + data[0] = n.require(data[0], dtype=n.uint8) + dir_name = '%s_predictions_batch_%d' % (os.path.basename(self.save_file), batch) + tar_name = os.path.join(self.save_preds, '%s.tar' % dir_name) + tfo = gfile.GFile(tar_name, "w") + tf = TarFile(fileobj=tfo, mode='w') + for img_idx in xrange(NUM_IMGS): + img = data[0][img_idx,:,:,:] + imsave = Image.fromarray(img) + prefix = "CORRECT" if data[1][0,img_idx] == preds_thresh[img_idx,PRED_IDX] else "FALSE_POS" if preds_thresh[img_idx,PRED_IDX] == 1 else "FALSE_NEG" + file_name = "%s_%.2f_%d_%05d_%d.png" % (prefix, preds[img_idx,PRED_IDX], batch, img_idx, data[1][0,img_idx]) +# gf = gfile.GFile(file_name, "w") + file_string = StringIO() + imsave.save(file_string, "PNG") + tarinf = TarInfo(os.path.join(dir_name, file_name)) + tarinf.size = file_string.tell() + file_string.seek(0) + tf.addfile(tarinf, file_string) + tf.close() + tfo.close() +# gf.close() + print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name) + else: + fig = pl.figure(3, figsize=(12,9)) + fig.text(.4, .95, '%s test samples' % ('Mistaken' if self.only_errors else 'Random')) + if self.only_errors: + # what the net got wrong + if NUM_OUTPUTS > 1: + err_idx = [i for i,p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:,i] > 0)[0]] + else: + err_idx = n.where(data[1][0,:] != preds[:,0].T)[0] + print err_idx + err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS)) + data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:] + + + import matplotlib.gridspec as gridspec + import matplotlib.colors as colors + cconv = colors.ColorConverter() + gs = gridspec.GridSpec(NUM_ROWS*2, NUM_COLS, + width_ratios=[1]*NUM_COLS, height_ratios=[2,1]*NUM_ROWS ) + #print data[1] + for row in 
xrange(NUM_ROWS): + for col in xrange(NUM_COLS): + img_idx = row * NUM_COLS + col + if data[0].shape[0] <= img_idx: + break + pl.subplot(gs[(row * 2) * NUM_COLS + col]) + #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1) + pl.xticks([]) + pl.yticks([]) + img = data[0][img_idx,:,:,:] + pl.imshow(img, interpolation='lanczos') + show_title = data[1].shape[0] == 1 + true_label = [int(data[1][0,img_idx])] if show_title else n.where(data[1][:,img_idx]==1)[0] + #print true_label + #print preds[img_idx,:].shape + #print preds[img_idx,:].max() + true_label_names = [label_names[i] for i in true_label] + img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:] + #print img_labels + axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col]) + height = 0.5 + ylocs = n.array(range(NUM_TOP_CLASSES))*height + pl.barh(ylocs, [l[0] for l in img_labels], height=height, \ + color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels]) + #pl.title(", ".join(true_labels)) + if show_title: + pl.title(", ".join(true_label_names), fontsize=15, fontweight='bold') + else: + print true_label_names + pl.yticks(ylocs + height/2, [l[1] for l in img_labels], x=1, backgroundcolor=cconv.to_rgba('0.65', alpha=0.5), weight='bold') + for line in enumerate(axes.get_yticklines()): + line[1].set_visible(False) + #pl.xticks([width], ['']) + #pl.yticks([]) + pl.xticks([]) + pl.ylim(0, ylocs[-1] + height) + pl.xlim(0, 1) + + def start(self): + self.op.print_values() +# print self.show_cost + if self.show_cost: + self.plot_cost() + if self.show_filters: + self.plot_filters() + if self.show_preds: + self.plot_predictions() + + if pl: + pl.show() + sys.exit(0) + + @classmethod + def get_options_parser(cls): + op = ConvNet.get_options_parser() + for option in list(op.options): + if option not in ('gpu', 'load_file', 'inner_size', 'train_batch_range', 'test_batch_range', 'multiview_test', 'data_path', 'pca_noise', 'scalar_mean'): + op.delete_option(option) + op.add_option("show-cost", "show_cost", StringOptionParser, "Show specified objective function", default="") + op.add_option("show-filters", "show_filters", StringOptionParser, "Show learned filters in specified layer", default="") + op.add_option("norm-filters", "norm_filters", BooleanOptionParser, "Individually normalize filters shown with --show-filters", default=0) + op.add_option("input-idx", "input_idx", IntegerOptionParser, "Input index for layer given to --show-filters", default=0) + op.add_option("cost-idx", "cost_idx", IntegerOptionParser, "Cost function return value index for --show-cost", default=0) + op.add_option("no-rgb", "no_rgb", BooleanOptionParser, "Don't combine filter channels into RGB in layer given to --show-filters", default=False) + op.add_option("yuv-to-rgb", "yuv_to_rgb", BooleanOptionParser, "Convert RGB filters to YUV in layer given to --show-filters", default=False) + op.add_option("channels", "channels", IntegerOptionParser, "Number of channels in layer given to --show-filters (fully-connected layers only)", default=0) + op.add_option("show-preds", "show_preds", StringOptionParser, "Show predictions made by given softmax on test set", default="") + op.add_option("save-preds", "save_preds", StringOptionParser, "Save predictions to given path instead of showing them", default="") + op.add_option("only-errors", "only_errors", BooleanOptionParser, "Show only mistaken predictions (to be used with --show-preds)", default=False, requires=['show_preds']) + op.add_option("local-plane", 
"local_plane", IntegerOptionParser, "Local plane to show", default=0) + op.add_option("smooth-test-errors", "smooth_test_errors", BooleanOptionParser, "Use running average for test error plot?", default=1) + + op.options['load_file'].default = None + return op + +if __name__ == "__main__": + #nr.seed(6) + try: + op = ShowConvNet.get_options_parser() + op, load_dic = IGPUModel.parse_options(op) + model = ShowConvNet(op, load_dic) + model.start() + except (UnpickleError, ShowNetError, opt.GetoptError), e: + print "----------------" + print "Error:" + print e diff --git a/caffe2/contrib/cuda-convnet2/util/Makefile b/caffe2/contrib/cuda-convnet2/util/Makefile new file mode 100644 index 0000000..55aba16 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/Makefile @@ -0,0 +1,57 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LINK_LIBS := -L$(ATLAS_LIB_PATH) -latlas -lcblas +INCLUDES := -I./include +COMMONFLAGS := +CC_ARGS := +CC=g++ + +ifndef debug + CC_ARGS += -O3 +endif + +OUT_DIR=./bin/$(OUT_SUFFIX) +OUT_FILE=libutil.so + +ifeq ($(numpy), 1) + PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) + LINK_LIBS += -lpython$(PYTHON_VERSION) + + INCLUDES += -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH) + COMMONFLAGS += -DNUMPY_INTERFACE + OUT_FILE=libutilpy.so +endif + +OBJECTS = matrix.cpp + +all: dir classes $(OUT_FILE) + +dir: + mkdir -p $(OUT_DIR)/src + +SOURCES = $(shell echo src/*.cpp) +CLASSES = $(SOURCES:.cpp=.o) + +classes: $(CLASSES) + +%.o: %.cpp + $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o + +$(OUT_FILE): classes + cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS) + ln -sf $(OUT_DIR)/$(OUT_FILE) . + +clean: + rm -rf $(OUT_DIR)/* diff --git a/caffe2/contrib/cuda-convnet2/util/include/matrix.h b/caffe2/contrib/cuda-convnet2/util/include/matrix.h new file mode 100644 index 0000000..c75da8c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/matrix.h @@ -0,0 +1,263 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include "matrix_funcs.h"
+#ifdef NUMPY_INTERFACE
+#include <Python.h>
+#include <arrayobject.h>
+#endif
+#include <limits>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+
+extern "C" {
+// #include <cblas.h>
+#include "caffe2/utils/cblas.h"
+}
+
+#ifdef DOUBLE_PRECISION
+#define CBLAS_GEMM cblas_dgemm
+#define CBLAS_SCAL cblas_dscal
+#define CBLAS_AXPY cblas_daxpy
+#else
+#define CBLAS_GEMM cblas_sgemm
+#define CBLAS_SCAL cblas_sscal
+#define CBLAS_AXPY cblas_saxpy
+#endif /* DOUBLE_PRECISION */
+
+#define MTYPE_MAX numeric_limits<MTYPE>::max()
+
+typedef long long int int64;
+
+class Matrix {
+private:
+    MTYPE* _data;
+    bool _ownsData;
+    int64 _numRows, _numCols;
+    int64 _numElements;
+    CBLAS_TRANSPOSE _trans;
+
+    void _init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData);
+    void _tileTo2(Matrix& target) const;
+    void _copyAllTo(Matrix& target) const;
+    MTYPE _sum_column(int64 col) const;
+    MTYPE _sum_row(int64 row) const;
+    MTYPE _aggregate(MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    void _aggregate(int64 axis, Matrix& target, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    MTYPE _aggregateRow(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    MTYPE _aggregateCol(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
+    void _updateDims(int64 numRows, int64 numCols);
+    void _applyLoop(MTYPE(*func)(MTYPE));
+    void _applyLoop(MTYPE (*func)(MTYPE), Matrix& target);
+    void _applyLoop2(const Matrix& a, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
+    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const;
+    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const;
+    void _applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
+    void _checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
+    void _divideByVector(const Matrix& vec, Matrix& target);
+    inline int64 _getNumColsBackEnd() const {
+        return _trans == CblasNoTrans ? _numCols : _numRows;
+    }
+public:
+    enum FUNCTION {
+        TANH, RECIPROCAL, SQUARE, ABS, EXP, LOG, ZERO, ONE, LOGISTIC1, LOGISTIC2, SIGN
+    };
+    Matrix();
+    Matrix(int64 numRows, int64 numCols);
+    Matrix(int64 numRows, int64 numCols, bool transpose);
+#ifdef NUMPY_INTERFACE
+    Matrix(const PyArrayObject *src);
+#endif
+    Matrix(const Matrix &like);
+    Matrix(MTYPE* data, int64 numRows, int64 numCols);
+    Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose);
+    ~Matrix();
+
+    inline MTYPE& getCell(int64 i, int64 j) const {
+        assert(i >= 0 && i < _numRows);
+        assert(j >= 0 && j < _numCols);
+        if (_trans == CblasTrans) {
+            return _data[j * _numRows + i];
+        }
+        return _data[i * _numCols + j];
+    }
+
+    MTYPE& operator()(int64 i, int64 j) const {
+        return getCell(i, j);
+    }
+
+    inline MTYPE* getData() const {
+        return _data;
+    }
+
+    inline bool isView() const {
+        return !_ownsData;
+    }
+
+    inline int64 getNumRows() const {
+        return _numRows;
+    }
+
+    inline int64 getNumCols() const {
+        return _numCols;
+    }
+
+    inline int64 getNumDataBytes() const {
+        return _numElements * sizeof(MTYPE);
+    }
+
+    inline int64 getNumElements() const {
+        return _numElements;
+    }
+
+    inline int64 getLeadingDim() const {
+        return _trans == CblasTrans ? _numRows : _numCols;
+    }
+
+    inline int64 getFollowingDim() const {
+        return _trans == CblasTrans ?
_numCols : _numRows; + } + + inline CBLAS_TRANSPOSE getBLASTrans() const { + return _trans; + } + + inline bool isSameDims(const Matrix& a) const { + return a.getNumRows() == getNumRows() && a.getNumCols() == getNumCols(); + } + + inline bool isTrans() const { + return _trans == CblasTrans; + } + + /* + * Only use if you know what you're doing! + * Does not update any dimensions. Just flips the _trans flag. + * + * Use transpose() if you want to get the transpose of this matrix. + */ + inline void setTrans(bool trans) { + assert(isTrans() == trans || !isView()); + _trans = trans ? CblasTrans : CblasNoTrans; + } + + void apply(FUNCTION f); + void apply(Matrix::FUNCTION f, Matrix& target); + void subtractFromScalar(MTYPE scalar); + void subtractFromScalar(MTYPE scalar, Matrix &target) const; + void biggerThanScalar(MTYPE scalar); + void smallerThanScalar(MTYPE scalar); + void equalsScalar(MTYPE scalar); + void biggerThanScalar(MTYPE scalar, Matrix& target) const; + void smallerThanScalar(MTYPE scalar, Matrix& target) const; + void equalsScalar(MTYPE scalar, Matrix& target) const; + void biggerThan(Matrix& a); + void biggerThan(Matrix& a, Matrix& target) const; + void smallerThan(Matrix& a); + void smallerThan(Matrix& a, Matrix& target) const; + void minWith(Matrix &a); + void minWith(Matrix &a, Matrix &target) const; + void maxWith(Matrix &a); + void maxWith(Matrix &a, Matrix &target) const; + void equals(Matrix& a); + void equals(Matrix& a, Matrix& target) const; + void notEquals(Matrix& a) ; + void notEquals(Matrix& a, Matrix& target) const; + void add(const Matrix &m); + void add(const Matrix &m, MTYPE scale); + void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM); + void add(const Matrix &m, Matrix& target); + void add(const Matrix &m, MTYPE scaleM, Matrix &target); + void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target); + void subtract(const Matrix &m); + void subtract(const Matrix &m, Matrix& target); + void subtract(const Matrix &m, MTYPE scale); + void subtract(const Matrix &m, MTYPE scale, Matrix& target); + void addVector(const Matrix& vec, MTYPE scale); + void addVector(const Matrix& vec, MTYPE scale, Matrix& target); + void addVector(const Matrix& vec); + void addVector(const Matrix& vec, Matrix& target); + void addScalar(MTYPE scalar); + void addScalar(MTYPE scalar, Matrix& target) const; + void maxWithScalar(MTYPE scalar); + void maxWithScalar(MTYPE scalar, Matrix &target) const; + void minWithScalar(MTYPE scalar); + void minWithScalar(MTYPE scalar, Matrix &target) const; + void eltWiseMultByVector(const Matrix& vec); + void eltWiseMultByVector(const Matrix& vec, Matrix& target); + void eltWiseDivideByVector(const Matrix& vec); + void eltWiseDivideByVector(const Matrix& vec, Matrix& target); + void resize(int64 newNumRows, int64 newNumCols); + void resize(const Matrix& like); + Matrix& slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const; + void slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix &target) const; + Matrix& sliceRows(int64 startRow, int64 endRow) const; + void sliceRows(int64 startRow, int64 endRow, Matrix& target) const; + Matrix& sliceCols(int64 startCol, int64 endCol) const; + void sliceCols(int64 startCol, int64 endCol, Matrix& target) const; + void rightMult(const Matrix &b, MTYPE scale); + void rightMult(const Matrix &b, Matrix &target) const; + void rightMult(const Matrix &b); + void rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const; + void addProduct(const Matrix &a, const 
Matrix &b, MTYPE scaleAB, MTYPE scaleThis);
+    void addProduct(const Matrix& a, const Matrix& b);
+    void eltWiseMult(const Matrix& a);
+    void eltWiseMult(const Matrix& a, Matrix& target) const;
+    void eltWiseDivide(const Matrix& a);
+    void eltWiseDivide(const Matrix& a, Matrix &target) const;
+    Matrix& transpose() const;
+    Matrix& transpose(bool hard) const;
+    Matrix& tile(int64 timesY, int64 timesX) const;
+    void tile(int64 timesY, int64 timesX, Matrix& target) const;
+    void copy(Matrix &dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const;
+    Matrix& copy() const;
+    void copy(Matrix& target) const;
+    Matrix& sum(int64 axis) const;
+    void sum(int64 axis, Matrix &target) const;
+    MTYPE sum() const;
+    MTYPE max() const;
+    Matrix& max(int64 axis) const;
+    void max(int64 axis, Matrix& target) const;
+    MTYPE min() const;
+    Matrix& min(int64 axis) const;
+    void min(int64 axis, Matrix& target) const;
+    MTYPE norm() const;
+    MTYPE norm2() const;
+    void scale(MTYPE scale);
+    void scale(MTYPE alpha, Matrix& target);
+    void reshape(int64 numRows, int64 numCols);
+    Matrix& reshaped(int64 numRows, int64 numCols);
+    void printShape(const char* name) const;
+    bool hasNan() const;
+    bool hasInf() const;
+
+    void randomizeNormal(MTYPE mean, MTYPE stdev);
+    void randomizeUniform();
+    void randomizeNormal();
+    void print() const;
+    void print(int64 startRow,int64 rows, int64 startCol,int64 cols) const;
+    void print(int64 rows, int64 cols) const;
+};
+
+typedef std::vector<Matrix*> MatrixV;
+
+#endif /* MATRIX_H_ */
diff --git a/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h b/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h
new file mode 100644
index 0000000..2d37ff1
--- /dev/null
+++ b/caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MATRIX_FUNCS_H_
+#define MATRIX_FUNCS_H_
+
+#include <stdlib.h>
+#include <math.h>
+#include <algorithm>
+
+#ifdef DOUBLE_PRECISION
+#define MTYPE double
+#else
+#define MTYPE float
+#endif
+
+#define MYRAND ((double)rand() / ((double)RAND_MAX + 1))
+
+inline MTYPE _zero(MTYPE /*x*/) {
+    return 0;
+}
+
+inline MTYPE _one(MTYPE /*x*/) {
+    return 1;
+}
+
+inline MTYPE _abs(MTYPE x) {
+    return x > 0 ? x : -x;
+}
+
+inline MTYPE _square(MTYPE x) {
+    return x * x;
+}
+
+inline MTYPE _sigma1(MTYPE x) {
+    return (tanh(x / 2) + 1) / 2;
+}
+
+inline MTYPE _sigma2(MTYPE x) {
+    return 1 / (1 + exp(-x));
+}
+
+inline MTYPE _recip(MTYPE x) {
+    return 1 / x;
+}
+
+inline MTYPE _exp(MTYPE x) {
+    return exp(x);
+}
+
+inline MTYPE _log(MTYPE x) {
+    return log(x);
+}
+
+inline MTYPE _tanh(MTYPE x) {
+    return tanh(x);
+}
+
+inline MTYPE _sign(MTYPE x) {
+    return x > 0 ?
1 : -1; +} + +inline MTYPE _rand(MTYPE /*x*/) { + return MYRAND; +} + +inline MTYPE _divide(MTYPE x, MTYPE y) { + return x / y; +} + +inline MTYPE _mult(MTYPE x, MTYPE y) { + return x * y; +} + +inline MTYPE _add(MTYPE x, MTYPE y) { + return x + y; +} + +inline MTYPE _addSquare(MTYPE x, MTYPE y) { + return x*x + y; +} + +inline MTYPE _addWithScale(MTYPE x, MTYPE y, MTYPE scale) { + return x + scale*y; +} + +inline MTYPE _addWithScale2(MTYPE x, MTYPE y, MTYPE scaleThis, MTYPE scaleM) { + return scaleThis * x + scaleM * y; +} + +inline MTYPE _max(MTYPE x, MTYPE y) { + return std::max(x, y); +} + +inline MTYPE _min(MTYPE x, MTYPE y) { + return std::min(x, y); +} + +inline MTYPE _bigger(MTYPE x, MTYPE y) { + return x > y; +} + +inline MTYPE _smaller(MTYPE x, MTYPE y) { + return x < y; +} + +inline MTYPE _equal(MTYPE x, MTYPE y) { + return x == y; +} + +inline MTYPE _notEqual(MTYPE x, MTYPE y) { + return x != y; +} + +#endif /* MATRIX_FUNCS_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/queue.h b/caffe2/contrib/cuda-convnet2/util/include/queue.h new file mode 100644 index 0000000..e5cddd4 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/queue.h @@ -0,0 +1,112 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef QUEUE_H_ +#define QUEUE_H_ +#include +#include + +/* + * A thread-safe circular queue that automatically grows but never shrinks. + */ +template +class Queue { +private: + T *_elements; + int _numElements; + int _head, _tail; + int _maxSize; + pthread_mutex_t *_queueMutex; + pthread_cond_t *_queueCV; + + void _init(int initialSize) { + _numElements = 0; + _head = 0; + _tail = 0; + _maxSize = initialSize; + _elements = new T[initialSize]; + _queueCV = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t))); + _queueMutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + pthread_mutex_init(_queueMutex, NULL); + pthread_cond_init(_queueCV, NULL); + } + + void expand() { + T *newStorage = new T[_maxSize * 2]; + memcpy(newStorage, _elements + _head, (_maxSize - _head) * sizeof(T)); + memcpy(newStorage + _maxSize - _head, _elements, _tail * sizeof(T)); + delete[] _elements; + _elements = newStorage; + _head = 0; + _tail = _numElements; + _maxSize *= 2; + } +public: + Queue(int initialSize) { + _init(initialSize); + } + + Queue() { + _init(1); + } + + ~Queue() { + pthread_mutex_destroy(_queueMutex); + pthread_cond_destroy(_queueCV); + delete[] _elements; + free(_queueMutex); + free(_queueCV); + } + + void enqueue(T el) { + pthread_mutex_lock(_queueMutex); + if (_numElements == _maxSize) { + expand(); + } + _elements[_tail] = el; + _tail = (_tail + 1) % _maxSize; + _numElements++; + + pthread_cond_signal(_queueCV); + pthread_mutex_unlock(_queueMutex); + } + + /* + * Blocks until not empty. + */ + T dequeue() { + pthread_mutex_lock(_queueMutex); + // Apparently, pthread_cond_signal may actually unblock + // multiple threads, so a while loop is needed here. 
+ while (_numElements == 0) { + pthread_cond_wait(_queueCV, _queueMutex); + } + T el = _elements[_head]; + _head = (_head + 1) % _maxSize; + _numElements--; + pthread_mutex_unlock(_queueMutex); + return el; + } + + /* + * Obviously this number can change by the time you actually look at it. + */ + inline int getNumElements() const { + return _numElements; + } +}; + +#endif /* QUEUE_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/sync.h b/caffe2/contrib/cuda-convnet2/util/include/sync.h new file mode 100644 index 0000000..00113a5 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/sync.h @@ -0,0 +1,79 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SYNC_H_ +#define SYNC_H_ + +#include + +class Lock { +private: + pthread_mutex_t _mutex; +public: + Lock() { + pthread_mutex_init(&_mutex, NULL); + } + ~Lock() { + pthread_mutex_destroy(&_mutex); + } + + void acquire() { + pthread_mutex_lock(&_mutex); + } + + void release() { + pthread_mutex_unlock(&_mutex); + } +}; + +class ThreadSynchronizer { +private: + int _numThreads; + int _numSynced; + pthread_mutex_t *_syncMutex; + pthread_cond_t *_syncThresholdCV; +public: + ThreadSynchronizer(int numThreads) { + _numThreads = numThreads; + _numSynced = 0; + _syncMutex = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t)); + _syncThresholdCV = (pthread_cond_t*) malloc(sizeof(pthread_cond_t)); + pthread_mutex_init(_syncMutex, NULL); + pthread_cond_init(_syncThresholdCV, NULL); + } + + ~ThreadSynchronizer() { + pthread_mutex_destroy(_syncMutex); + pthread_cond_destroy(_syncThresholdCV); + free(_syncMutex); + free(_syncThresholdCV); + } + + void sync() { + pthread_mutex_lock(_syncMutex); + _numSynced++; + + if (_numSynced == _numThreads) { + _numSynced = 0; + pthread_cond_broadcast(_syncThresholdCV); + } else { + pthread_cond_wait(_syncThresholdCV, _syncMutex); + } + pthread_mutex_unlock(_syncMutex); + } +}; + +#endif /* SYNC_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/include/thread.h b/caffe2/contrib/cuda-convnet2/util/include/thread.h new file mode 100644 index 0000000..8380b58 --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/include/thread.h @@ -0,0 +1,111 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef THREAD_H_ +#define THREAD_H_ +#include +#include +#include +#include +#include + +#define NUM_CPUS_MAX 48 + +/* + * Abstract joinable thread class. 
+ * The only thing the implementer has to fill in is the run method. + */ +class Thread { +private: + cpu_set_t *_cpu_set; + pthread_attr_t _pthread_attr; + pthread_t _threadID; + bool _joinable, _startable; + + static void* start_pthread_func(void *obj) { + void* retval = reinterpret_cast(obj)->run(); + pthread_exit(retval); + return retval; + } +protected: + virtual void* run() = 0; +public: + Thread(bool joinable) : _cpu_set(NULL), _joinable(joinable), _startable(true) { + pthread_attr_init(&_pthread_attr); + } + + Thread(bool joinable, std::vector& cpus) : _cpu_set(NULL), _joinable(joinable), _startable(true) { + pthread_attr_init(&_pthread_attr); + setAffinity(cpus); + } + + virtual ~Thread() { + if (_cpu_set != NULL) { + CPU_FREE(_cpu_set); + } + pthread_attr_destroy(&_pthread_attr); + } + + void setAffinity(std::vector& cpus) { + assert(_startable); + _cpu_set = CPU_ALLOC(NUM_CPUS_MAX); + size_t size = CPU_ALLOC_SIZE(NUM_CPUS_MAX); + if (cpus.size() > 0 && cpus[0] >= 0) { + CPU_ZERO_S(size, _cpu_set); + for (int i = 0; i < cpus.size(); i++) { + assert(cpus[i] < NUM_CPUS_MAX); + CPU_SET_S(cpus[i], size, _cpu_set); +// printf("set cpu %d\n", cpus[i]); + } + pthread_attr_setaffinity_np(&_pthread_attr, size, _cpu_set); + } + } + + pthread_t start() { + assert(_startable); + _startable = false; + pthread_attr_setdetachstate(&_pthread_attr, _joinable ? PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED); + int n; + if ((n = pthread_create(&_threadID, &_pthread_attr, &Thread::start_pthread_func, (void*)this))) { + errno = n; + perror("pthread_create error"); + } + return _threadID; + } + + void join(void **status) { + assert(_joinable); + int n; + if((n = pthread_join(_threadID, status))) { + errno = n; + perror("pthread_join error"); + } + } + + void join() { + join(NULL); + } + + pthread_t getThreadID() const { + return _threadID; + } + + bool isStartable() const { + return _startable; + } +}; + +#endif /* THREAD_H_ */ diff --git a/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp b/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp new file mode 100644 index 0000000..a1da84c --- /dev/null +++ b/caffe2/contrib/cuda-convnet2/util/src/matrix.cpp @@ -0,0 +1,820 @@ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/matrix.h" +#include "../include/matrix_funcs.h" + +#if defined(_WIN64) || defined(_WIN32) +double sqrt(int _X) {return sqrt((double) _X);} +double log(int _X) {return log((double) _X);} +#endif + +using namespace std; + +void Matrix::_init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData) { + _updateDims(numRows, numCols); + _ownsData = ownsData; + _trans = transpose ? CblasTrans : CblasNoTrans; + _data = data; +} + +Matrix::Matrix() { + _init(NULL, 0, 0, false, true); +} + +Matrix::Matrix(int64 numRows, int64 numCols) { + _init(NULL, numRows, numCols, false, true); + this->_data = numRows * numCols > 0 ? 
new MTYPE[this->_numElements] : NULL; +} + +Matrix::Matrix(int64 numRows, int64 numCols, bool transpose) { + _init(NULL, numRows, numCols, transpose, true); + this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL; +} + +Matrix::Matrix(const Matrix &like) { + _init(NULL, like.getNumRows(), like.getNumCols(), false, true); + this->_data = new MTYPE[this->_numElements]; +} + +/* construct a matrix with another matrix's data. the resultant + * matrix does NOT own its data */ +Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols) { + _init(data, numRows, numCols, false, false); +} + +/* construct a matrix with another matrix's data (and optionally transpose it). the resultant + * matrix does NOT own its data -- it is a VIEW */ +Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose) { + _init(data, numRows, numCols, transpose, false); +} + +#ifdef NUMPY_INTERFACE +Matrix::Matrix(const PyArrayObject *src) { + this->_data = NULL; + this->_trans = CblasNoTrans; + if (src != NULL) { + this->_updateDims(PyArray_DIM(src,0), PyArray_DIM(src,1)); + if (src->flags & NPY_CONTIGUOUS || src->flags & NPY_FORTRAN) { + this->_data = (MTYPE*) src->data; + this->_ownsData = false; + this->_trans = src->flags & NPY_CONTIGUOUS ? CblasNoTrans : CblasTrans; + } else { + this->_data = new MTYPE[PyArray_DIM(src,0) * PyArray_DIM(src,1)]; + for (int64 i = 0; i < PyArray_DIM(src,0); i++) { + for (int64 j = 0; j < PyArray_DIM(src,1); j++) { + (*this)(i,j) = *reinterpret_cast(PyArray_GETPTR2(src,i,j)); + } + } + this->_ownsData = true; + } + } +} +#endif +Matrix::~Matrix() { + if(this->_data != NULL && this->_ownsData) { + delete[] this->_data; + } +} + +void Matrix::_updateDims(int64 numRows, int64 numCols) { + this->_numRows = numRows; + this->_numCols = numCols; + this->_numElements = numRows * numCols; +} + +void Matrix::_checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const { + assert(startRow >= 0 && startRow <= _numRows); + assert(endRow >= 0 && endRow <= _numRows); + assert(startCol >= 0 && startCol <= _numCols); + assert(endCol >= 0 && endCol <= _numCols); +} + +/* will return a view if possible */ +Matrix& Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + if (!isTrans() && ((startCol == 0 && endCol == this->_numCols) || (startRow == endRow - 1))) { + return *new Matrix(this->_data + startRow * this->_numCols + startCol, endRow - startRow, endCol - startCol); + } else if (isTrans() && ((startRow == 0 && endRow == this->_numRows) || (startCol == endCol - 1))) { + return *new Matrix(this->_data + startCol * this->_numRows + startRow, endRow - startRow, endCol - startCol, true); + } + Matrix& newSlice = *new Matrix(endRow - startRow, endCol - startCol); + this->copy(newSlice, startRow, endRow, startCol, endCol, 0, 0); + return newSlice; +} + +/* this will NEVER return a view, unlike Matrix_slice */ +void Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix& target) const { + endRow = endRow < 0 ? this->_numRows : endRow; + endCol = endCol < 0 ? 
this->_numCols : endCol; + _checkBounds(startRow, endRow, startCol, endCol); + target.resize(endRow - startRow, endCol - startCol); + this->copy(target, startRow, endRow, startCol, endCol, 0, 0); +} + +Matrix& Matrix::sliceRows(int64 startRow, int64 endRow) const { + return slice(startRow, endRow, 0, -1); +} + +void Matrix::sliceRows(int64 startRow, int64 endRow, Matrix& target) const { + slice(startRow, endRow, 0, -1, target); +} + +Matrix& Matrix::sliceCols(int64 startCol, int64 endCol) const { + return slice(0, -1, startCol, endCol); +} + +void Matrix::sliceCols(int64 startCol, int64 endCol, Matrix& target) const { + slice(0, -1, startCol, endCol, target); +} + +void Matrix::subtractFromScalar(MTYPE scalar) { + subtractFromScalar(scalar, *this); +} + +void Matrix::subtractFromScalar(MTYPE scalar, Matrix& target) const { + if(&target != this) { + copy(target); + } + target.scale(-1); + target.addScalar(scalar); +} + +void Matrix::biggerThanScalar(MTYPE scalar) { + biggerThanScalar(scalar, *this); +} + +void Matrix::smallerThanScalar(MTYPE scalar) { + smallerThanScalar(scalar, *this); +} + +void Matrix::equalsScalar(MTYPE scalar) { + equalsScalar(scalar, *this); +} + +void Matrix::biggerThanScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_bigger, target); +} + +void Matrix::smallerThanScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_smaller, target); +} + +void Matrix::equalsScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_equal, target); +} + +void Matrix::add(const Matrix &m) { + add(m, 1, *this); +} + +void Matrix::add(const Matrix &m, Matrix& target) { + add(m, 1, target); +} + +void Matrix::add(const Matrix &m, MTYPE scale) { + add(m, scale, *this); +} + +void Matrix::subtract(const Matrix &m) { + add(m, -1, *this); +} + +void Matrix::subtract(const Matrix &m, Matrix& target) { + add(m, -1, target); +} + +void Matrix::subtract(const Matrix &m, MTYPE scale) { + add(m, -scale, *this); +} + +void Matrix::subtract(const Matrix &m, MTYPE scale, Matrix& target) { + add(m, -scale, target); +} + +void Matrix::add(const Matrix &m, MTYPE scaleM, Matrix &target) { + add(m, 1, scaleM, target); +} + +void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM) { + add(m, scaleThis, scaleM, *this); +} + +void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target) { + assert(this->isSameDims(m)); + if (isTrans() != m.isTrans() || isTrans() != target.isTrans() || scaleThis != 1) { + if (&target != this) { + target.resize(*this); + } + if(scaleThis == 1 && scaleM == 1) { + this->_applyLoop2(m, &_add, target); + } else if (scaleThis == 1) { + this->_applyLoop2(m, &_addWithScale, scaleM, target); + } else { + this->_applyLoop2(m, &_addWithScale2, scaleThis, scaleM, target); + } + } else { + if (&target != this) { + copy(target); + } + CBLAS_AXPY(getNumElements(), scaleM, m._data, 1, target._data, 1); + } +} + +void Matrix::addScalar(MTYPE scalar) { + addScalar(scalar, *this); +} + +void Matrix::addScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_add, target); +} + +void Matrix::maxWithScalar(MTYPE scalar) { + maxWithScalar(scalar, *this); +} + +void Matrix::maxWithScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_max, target); +} + +void Matrix::minWithScalar(MTYPE scalar) { + minWithScalar(scalar, *this); +} + +void 
Matrix::minWithScalar(MTYPE scalar, Matrix& target) const { + target.resize(*this); + _applyLoopScalar(scalar, &_min, target); +} + +void Matrix::biggerThan(Matrix& a) { + biggerThan(a, *this); +} + +void Matrix::biggerThan(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_bigger, target); +} + +void Matrix::smallerThan(Matrix& a) { + smallerThan(a, *this); +} + +void Matrix::smallerThan(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_smaller, target); +} + +void Matrix::equals(Matrix& a) { + equals(a, *this); +} + +void Matrix::equals(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_equal, target); +} + +void Matrix::notEquals(Matrix& a) { + notEquals(a, *this); +} + +void Matrix::notEquals(Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_notEqual, target); +} + +void Matrix::minWith(Matrix &a) { + minWith(a, *this); +} + +void Matrix::minWith(Matrix &a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_min, target); +} + +void Matrix::maxWith(Matrix &a) { + maxWith(a, *this); +} + +void Matrix::maxWith(Matrix &a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + _applyLoop2(a, &_max, target); +} + +/* this := this + scale*tile(vec) */ +void Matrix::addVector(const Matrix& vec, MTYPE scale, Matrix& target) { + if(&target != this) { + copy(target); + } + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + const bool colVector = vec.getNumCols() == 1; + assert((rowVector && vec.getNumCols() == target.getNumCols()) || (colVector && vec.getNumRows() == target.getNumRows())); + if (rowVector && colVector) { + addScalar(vec(0,0) * scale, target); + return; + } + const int64 loopTil = rowVector ? target.getNumRows() : target.getNumCols(); + const int64 dataInc = ((rowVector && target.isTrans()) || (!rowVector && !target.isTrans())) ? 1 : (rowVector ? target.getNumCols() : target.getNumRows()); + const int64 myStride = ((target.isTrans() && rowVector) || (!target.isTrans() && !rowVector)) ? loopTil : 1; + for (int64 i = 0; i < loopTil; i++) { + CBLAS_AXPY(vec.getNumElements(), scale, vec._data, 1, target._data + dataInc * i, myStride); + } +} + +/* this := this + scale*tile(vec) */ +void Matrix::addVector(const Matrix& vec, MTYPE scale) { + addVector(vec, scale, *this); +} + +void Matrix::addVector(const Matrix& vec) { + addVector(vec, 1, *this); +} + +void Matrix::addVector(const Matrix& vec, Matrix& target) { + addVector(vec, 1, target); +} + +void Matrix::eltWiseMultByVector(const Matrix& vec) { + eltWiseMultByVector(vec, *this); +} + +/* omg test these */ +void Matrix::eltWiseMultByVector(const Matrix& vec, Matrix& target) { + if(&target != this) { + copy(target); + } + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + assert((rowVector && vec.getNumCols() == target.getNumCols()) || (!rowVector && vec.getNumRows() == target.getNumRows())); + const int64 dataInc = ((rowVector && !target.isTrans()) || (!rowVector && target.isTrans())) ? 1 : (rowVector ? target.getNumRows() : target.getNumCols()); + const int64 myStride = ((!target.isTrans() && !rowVector) || (target.isTrans() && rowVector)) ? 1 : vec.getNumElements(); + const int64 numScaling = rowVector ? 
target.getNumRows() : target.getNumCols(); + for (int64 i = 0; i < vec.getNumElements(); i++) { + CBLAS_SCAL(numScaling, vec._data[i], target._data + dataInc * i, myStride); + } +} + +/* return := scale * this * b */ +void Matrix::rightMult(const Matrix& b, MTYPE scale) { + rightMult(b, scale, *this); +} + +/* return := this * b */ +void Matrix::rightMult(const Matrix& b) { + rightMult(b, 1); +} + +/* target := this * b + * also resizes target if necessary.*/ +void Matrix::rightMult(const Matrix &b, Matrix &target) const { + rightMult(b, 1, target); +} + +/* target := scaleAB * this * b + * also resizes target if necessary.*/ +void Matrix::rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const { + if(&target != this) { + target.resize(this->_numRows, b._numCols); + } + target.addProduct(*this, b, scaleAB, 0); +} + +/* this := scaleAB * a*b + scaleC * this + * ALL SIZES MUST BE CORRECT. */ +void Matrix::addProduct(const Matrix& a, const Matrix& b, MTYPE scaleAB, MTYPE scaleThis) { + assert(a.getNumCols() == b.getNumRows()); + assert(this->getNumRows() == a.getNumRows() && this->getNumCols() == b.getNumCols()); + assert(!isTrans()); + CBLAS_GEMM(CblasRowMajor, a._trans, b._trans, a._numRows, b._numCols, a._numCols, scaleAB, a._data, + a._getNumColsBackEnd(), b._data, b._getNumColsBackEnd(), scaleThis, this->_data, this->_numCols); +} + +void Matrix::addProduct(const Matrix& a, const Matrix& b) { + addProduct(a, b, 1, 1); +} + +Matrix& Matrix::transpose() const { + return *new Matrix(this->_data, this->_numCols, this->_numRows, !isTrans()); +} + +Matrix& Matrix::transpose(bool hard) const { + if (!hard || isTrans()) { + return transpose(); + } + Matrix &meTrans = *new Matrix(_numCols, _numRows); + for (int64 i = 0; i < _numRows; i++) { + for (int64 j = 0; j < _numCols; j++) { + meTrans(j, i) = (*this)(i, j); + } + } + return meTrans; +} + +Matrix& Matrix::tile(int64 timesY, int64 timesX) const { + Matrix& tiled = *new Matrix(this->_numRows * timesY, this->_numCols * timesX); + _tileTo2(tiled); + return tiled; +} + +/* resizes target if necessary */ +void Matrix::tile(int64 timesY, int64 timesX, Matrix& target) const { + target.resize(this->_numRows * timesY, this->_numCols * timesX); + _tileTo2(target); +} + +/* a variant ... seems to be no faster than original. */ +void Matrix::_tileTo2(Matrix& target) const { + for(int64 y = 0; y < target._numRows; y += this->_numRows) { + for(int64 x = 0; x < target._numCols; x += this->_numCols) { + this->copy(target, 0, -1, 0, -1, y, x); + } + } +} + +/* guarantees that result will be non-transposed */ +void Matrix::resize(int64 newNumRows, int64 newNumCols) { + if(this->_numRows != newNumRows || this->_numCols != newNumCols) { + assert(!isView()); + if (this->getNumElements() != newNumRows * newNumCols) { + delete[] this->_data; //deleting NULL is ok, sez c++ + this->_data = new MTYPE[newNumRows * newNumCols]; + } + this->_updateDims(newNumRows, newNumCols); + this->_trans = CblasNoTrans; + } +} + +void Matrix::resize(const Matrix& like) { + resize(like.getNumRows(), like.getNumCols()); +} + +void Matrix::scale(MTYPE alpha) { + scale(alpha, *this); +} + +void Matrix::scale(MTYPE alpha, Matrix& target) { + if (&target != this) { + target.resize(*this); + copy(target); + } + CBLAS_SCAL(getNumElements(), alpha, target._data, 1); +} + +/* performs no resizing. + * Warnings: + * 1. ALL DIMENSIONS MUST BE CORRECT + * 2. The source and destination memories better not overlap! 
*/ +void Matrix::copy(Matrix& dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const { + srcEndRow = srcEndRow < 0 ? this->_numRows : srcEndRow; + srcEndCol = srcEndCol < 0 ? this->_numCols : srcEndCol; + assert(destStartRow >= 0 && destStartCol >= 0); //some range-checking + assert(srcEndRow <= _numRows && srcEndCol <= _numCols); + assert(destStartRow + srcEndRow - srcStartRow <= dest.getNumRows()); + assert(destStartCol + srcEndCol - srcStartCol <= dest.getNumCols()); + // I found no evidence that memcpy is actually faster than just + // copying element-by-element. + if (!isTrans() && !dest.isTrans()) { + int64 src_start_idx = this->_numCols * srcStartRow + srcStartCol; + int64 dest_start_idx = dest._numCols * destStartRow + destStartCol; + int64 copy_row_width = srcEndCol - srcStartCol; + + for (int64 i = srcStartRow; i < srcEndRow; i++) { + memcpy(dest._data + dest_start_idx + dest._numCols * (i - srcStartRow), + this->_data + src_start_idx + this->_numCols * (i - srcStartRow), sizeof(MTYPE) * copy_row_width); + } + } else { + for (int64 i = srcStartRow; i < srcEndRow; i++) { + for (int64 j = srcStartCol; j < srcEndCol; j++) { + dest(i - srcStartRow + destStartRow, j - srcStartCol + destStartCol) = (*this)(i, j); + } + } + } +} + +/* preserves everything excluding transposedness. + * new matrix owns its data */ +Matrix& Matrix::copy() const { + Matrix& copy = *new Matrix(*this); + this->copy(copy); + return copy; +} + +/* resizes target if necessary */ +void Matrix::copy(Matrix& target) const { + target.resize(this->_numRows, this->_numCols); //target is now non-transposed + if(this->isTrans() == target.isTrans()) { + this->_copyAllTo(target); + } else { //if I'm transposed, make sure that target is non-transposed copy + this->copy(target, 0, -1, 0, -1, 0, 0); + } +} + +void Matrix::_copyAllTo(Matrix& target) const { + assert(target.isTrans() == isTrans()); + memcpy((void*) target._data, (void*) this->_data, this->getNumDataBytes()); + target._trans = this->_trans; +} + +MTYPE Matrix::min() const { + return _aggregate(&_min, MTYPE_MAX); +} + +Matrix& Matrix::min(int64 axis) const { + Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->min(axis, target); + return target; +} + +void Matrix::min(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_min, MTYPE_MAX); +} + +MTYPE Matrix::max() const { + return _aggregate(&_max, -MTYPE_MAX); +} + +Matrix& Matrix::max(int64 axis) const { + Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->max(axis, target); + return target; +} + +void Matrix::max(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_max, -MTYPE_MAX); +} + +MTYPE Matrix::sum() const { + return _aggregate(&_add, 0); +} + +MTYPE Matrix::norm() const { + return sqrt(norm2()); +} + +MTYPE Matrix::norm2() const { + return _aggregate(&_addSquare, 0); +} + +Matrix& Matrix::sum(int64 axis) const { + Matrix& target = axis == 0 ? 
*new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); + this->sum(axis, target); + return target; +} + +void Matrix::sum(int64 axis, Matrix& target) const { + _aggregate(axis, target, &_add, 0); +} + +void Matrix::_aggregate(int64 axis, Matrix& target, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + if (axis == 0) { + target.resize(1, this->_numCols); + for (int64 j = 0; j < this->_numCols; j++) { + target(0, j) = _aggregateCol(j, agg_func, initialValue); + } + } else { + target.resize(this->_numRows, 1); + for (int64 i = 0; i < this->_numRows; i++) { + target(i, 0) = _aggregateRow(i, agg_func, initialValue); + } + } +} + +MTYPE Matrix::_aggregateRow(int64 row, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + for (int64 j = 0; j < this->_numCols; j++) { + v = agg_func((*this)(row, j), v); + } + return v; +} + +MTYPE Matrix::_aggregateCol(int64 col, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + for (int64 i = 0; i < this->_numRows; i++) { + v = agg_func((*this)(i, col), v); + } + return v; +} + +MTYPE Matrix::_aggregate(MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const { + MTYPE v = initialValue; + MTYPE* ptr = _data; + for (int64 i = 0; i < getNumElements(); i++, ptr++) { + v = agg_func(*ptr, v); + } + return v; +} + +void Matrix::printShape(const char* name) const { + printf("%s: %lldx%lld\n", name, getNumRows(), getNumCols()); +} + +void Matrix::print() const { + print(0,getNumRows(),0, getNumCols()); +} + +void Matrix::print(int64 rows, int64 cols) const { + print(0,rows,0, cols); +} + +void Matrix::print(int64 startRow, int64 rows, int64 startCol, int64 cols) const { + for (int64 i = startRow; i < std::min(startRow+rows, this->_numRows); i++) { + for (int64 j = startCol; j < std::min(startCol+cols, this->_numCols); j++) { + printf("%.15f ", (*this)(i, j)); + } + printf("\n"); + } +} + +void Matrix::apply(Matrix::FUNCTION f) { + apply(f, *this); +} + + +void Matrix::apply(Matrix::FUNCTION f, Matrix& target) { + MTYPE (*func)(MTYPE); + if(f == EXP) { + func = &_exp; + } else if(f == TANH) { + func = &_tanh; + } else if(f == RECIPROCAL) { + func = &_recip; + } else if (f == SQUARE) { + func = &_square; + } else if(f == LOG) { + func = &_log; + } else if(f == ZERO) { + func = &_zero; + } else if (f == ONE) { + func = &_one; + } else if(f == LOGISTIC1) { + func = &_sigma1; + } else if(f == LOGISTIC2) { + func = &_sigma2; + } else if (f == ABS) { + func = &_abs; + } else if (f == SIGN) { + func = &_sign; + } else { + return; + //LOG(FATAL) << "Matrix::apply: Unknown function type"; + } + this->_applyLoop(func, target); +} + +void Matrix::eltWiseMult(const Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + this->_applyLoop2(a, &_mult, target); +} + +void Matrix::eltWiseDivide(const Matrix& a, Matrix& target) const { + assert(isSameDims(a)); + target.resize(*this); + this->_applyLoop2(a, &_divide, target); +} + +void Matrix::eltWiseMult(const Matrix& a) { + eltWiseMult(a, *this); +} + +void Matrix::eltWiseDivide(const Matrix& a) { + eltWiseDivide(a, *this); +} + +void Matrix::randomizeUniform() { + this->_applyLoop(&_rand); +} + +void Matrix::randomizeNormal() { + //LOG(FATAL) << "randomizeNormal only implemented on MKL!"; +} + +void Matrix::randomizeNormal(MTYPE /*mean*/, MTYPE /*stdev*/) { + // LOG(FATAL) << "randomizeNormal only implemented on MKL!"; +} + +void Matrix::eltWiseDivideByVector(const Matrix& vec) { + eltWiseDivideByVector(vec, 
*this); +} + +/* This function allocates a chunk of memory at most as big as the input vector */ +void Matrix::eltWiseDivideByVector(const Matrix& vec, Matrix& target) { + assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); + const bool rowVector = vec.getNumRows() == 1; + assert((rowVector && vec.getNumCols() == getNumCols()) || (!rowVector && vec.getNumRows() == getNumRows())); + if(&target != this) { + target.resize(*this); + } + _divideByVector(vec, target); +} + +void Matrix::_divideByVector(const Matrix& vec, Matrix& target) { + Matrix& vecInverse = vec.copy(); + vecInverse.apply(RECIPROCAL); + eltWiseMultByVector(vecInverse,target); + delete &vecInverse; +} + +void Matrix::reshape(int64 numRows, int64 numCols) { + assert(_numElements == numRows*numCols); + _numRows = numRows; + _numCols = numCols; +} + +Matrix& Matrix::reshaped(int64 numRows, int64 numCols) { + assert(_numElements == numRows*numCols); + return *new Matrix(_data, numRows, numCols, isTrans()); +} + +void Matrix::_applyLoop(MTYPE (*func)(MTYPE), Matrix& target) { + MTYPE *ptr = this->_data, *tgtPtr = target._data; + for (int64 i = 0; i < getNumElements(); i++, ptr++, tgtPtr++) { + *tgtPtr = (*func)(*ptr); + } +} + +void Matrix::_applyLoop(MTYPE (*func)(MTYPE)) { + _applyLoop(func, *this); +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE), Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j)); + } + } +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j), scalar); + } + } +} + +void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const { + for (int64 i = 0; i < getNumRows(); i++) { + for (int64 j = 0; j < getNumCols(); j++) { + target(i, j) = (*func)((*this)(i, j), a(i, j), scalar1, scalar2); + } + } +} + +void Matrix::_applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const { + MTYPE *myPtr = _data; + MTYPE *targetPtr = target._data; + for (int64 i = 0; i < getNumElements(); i++, myPtr++, targetPtr++) { + *targetPtr = (*func)(*myPtr, scalar); + } +} + +bool Matrix::hasNan() const { + for (int64 r = 0; r < _numRows; r++) { + for (int64 c = 0; c < _numCols; c++) { + if (isnan((*this)(r,c))) { + return true; + } + } + } + return false; +} + +bool Matrix::hasInf() const { + for (int64 r = 0; r < _numRows; r++) { + for (int64 c = 0; c < _numCols; c++) { + if (isinf((*this)(r,c))) { + return true; + } + } + } + return false; +} + + diff --git a/caffe2/contrib/docker-ubuntu-14.04/Dockerfile b/caffe2/contrib/docker-ubuntu-14.04/Dockerfile new file mode 100644 index 0000000..c8d2bcc --- /dev/null +++ b/caffe2/contrib/docker-ubuntu-14.04/Dockerfile @@ -0,0 +1,126 @@ +FROM ubuntu:14.04 +MAINTAINER caffe-dev + +# A docker container with CUDA and caffe2 installed. +# Note: this should install everything but cudnn, which requires you to have a +# manual registration and download from the NVidia website. After creating this +# docker image, the Caffe2 repository is located at /opt/caffe2. You can install +# cudnn manually and re-compile caffe2. + +################################################################################ +# Step 1: set up cuda on the ubuntu box. 
+################################################################################ + +RUN apt-get update && apt-get install -q -y \ + build-essential \ + wget + +RUN cd /tmp && \ + wget http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run && \ + chmod +x cuda_*_linux.run && ./cuda_*_linux.run -extract=`pwd` && \ + ./NVIDIA-Linux-x86_64-*.run -s --no-kernel-module && \ + ./cuda-linux64-rel-*.run -noprompt && \ + rm -rf * + +# Ensure the CUDA libs and binaries are in the correct environment variables +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 +ENV PATH=$PATH:/usr/local/cuda/bin + +# Run nvcc to make sure things are set correctly. +RUN nvcc --version + +################################################################################ +# Step 2: set up caffe2 pre-requisites +################################################################################ + +RUN apt-get update && apt-get install -q -y \ + git \ + libeigen3-dev \ + libgoogle-glog-dev \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libprotobuf-dev \ + libsnappy-dev \ + zlib1g-dev \ + libbz2-dev \ + protobuf-compiler \ + python-dev \ + python-pip + +RUN cd /tmp && \ + git clone https://github.com/facebook/rocksdb.git && \ + cd /tmp/rocksdb && \ + make && make install && \ + cd / && \ + rm -rf /tmp/rocksdb + +# Caffe2 works best with openmpi 1.8.5 or above (which has cuda support). +# If you do not need openmpi, skip this step. +RUN cd /tmp && \ + wget http://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-1.10.0.tar.gz && \ + tar xzvf openmpi-1.10.0.tar.gz && \ + cd /tmp/openmpi-1.10.0 && \ + ./configure --with-cuda --with-threads && \ + make && make install && \ + cd / && \ + rm -rf /tmp/openmpi-1.10.0 && \ + rm /tmp/openmpi-1.10.0.tar.gz + +# Caffe2 requires zeromq 4.0 or above, manually install. +# If you do not need zeromq, skip this step. +RUN apt-get install -q -y autoconf libtool +RUN mkdir /tmp/zeromq-build && \ + cd /tmp/zeromq-build && \ + wget https://github.com/zeromq/zeromq4-1/archive/v4.1.3.tar.gz && \ + tar xzvf v4.1.3.tar.gz --strip 1 && \ + ./autogen.sh && \ + ./configure --without-libsodium && \ + make && make install && \ + cd / && \ + rm -rf /tmp/zeromq-build + +# pip self upgrade +RUN pip install --upgrade pip + +# Python dependencies +RUN pip install \ + matplotlib \ + numpy \ + protobuf + +################################################################################ +# Step 3: install optional dependencies ("good to have" features) +################################################################################ + +RUN apt-get install -q -y \ + gfortran \ + graphviz \ + libatlas-base-dev \ + vim + +RUN pip install \ + flask \ + ipython \ + notebook \ + pydot \ + python-nvd3 \ + scipy \ + tornado + +# This is intentional. scikit-image has to be after scipy. +RUN pip install \ + scikit-image + +################################################################################ +# Step 4: set up caffe2 +################################################################################ + +# Get the repository, and build. +RUN cd /opt && \ + git clone https://github.com/Yangqing/caffe2.git && \ + cd /opt/caffe2 && \ + make + +# Now, we know that some of the caffe tests will fail. How do we deal with +# those? 
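+
+# Example usage (illustrative only; the image tag and device paths below are
+# assumptions, not something defined by this Dockerfile): build the image and
+# start a container with the NVIDIA device nodes exposed, e.g.
+#
+#   docker build -t caffe2-ubuntu14.04 .
+#   docker run --rm -it \
+#     --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 \
+#     caffe2-ubuntu14.04 /bin/bash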
diff --git a/caffe2/contrib/gloo/CMakeLists.txt b/caffe2/contrib/gloo/CMakeLists.txt
new file mode 100644
index 0000000..ff77e32
--- /dev/null
+++ b/caffe2/contrib/gloo/CMakeLists.txt
@@ -0,0 +1,22 @@
+if(USE_GLOO)
+  set(Caffe2_CONTRIB_GLOO_CPU_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/allgather_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/barrier_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
+  )
+
+  set(Caffe2_CONTRIB_GLOO_GPU_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
+  )
+
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
+  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
+endif()
diff --git a/caffe2/contrib/gloo/allgather_ops.cc b/caffe2/contrib/gloo/allgather_ops.cc
new file mode 100644
index 0000000..ff536bd
--- /dev/null
+++ b/caffe2/contrib/gloo/allgather_ops.cc
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "allgather_ops.h"
+
+#include <gloo/allgather_ring.h>
+
+// NOTE: the template arguments and <gloo/...>/<std> header names in these
+// gloo operator files were lost from the extracted patch; the specific types
+// and includes restored below are a best-effort reconstruction, not verbatim.
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+void AllgatherOp<Context>::initializeAlgorithm() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<float>(
+        init_.context,
+        init_.template getInputs<float>(),
+        init_.template getOutput<float>(),
+        init_.size));
+  } else if (init_.template IsType<long>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<long>(
+        init_.context,
+        init_.template getInputs<long>(),
+        init_.template getOutput<long>(),
+        init_.size));
+  } else if (init_.template IsType<int>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<int>(
+        init_.context,
+        init_.template getInputs<int>(),
+        init_.template getOutput<int>(),
+        init_.size));
+  } else if (init_.template IsType<float16>()) {
+    algorithm_.reset(new ::gloo::AllgatherRing<::gloo::float16>(
+        init_.context,
+        init_.template getInputs<::gloo::float16>(),
+        init_.template getOutput<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+namespace {
+
+REGISTER_CPU_OPERATOR_WITH_ENGINE(Allgather, GLOO, AllgatherOp<CPUContext>);
+
+} // namespace
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h
new file mode 100644
index 0000000..044357c
--- /dev/null
+++ b/caffe2/contrib/gloo/allgather_ops.h
@@ -0,0 +1,130 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <mutex>
+
+#include "caffe2/contrib/gloo/common.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/types.h"
+
+#include <gloo/algorithm.h>
+#include <gloo/common/error.h>
+#include <gloo/context.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+class AllgatherOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  AllgatherOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        ws_(ws),
+        status_blob_(
+            OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
+    if (status_blob_ != "") {
+      ws_->CreateBlob(status_blob_);
+    }
+  }
+
+  virtual ~AllgatherOp() {}
+
+  bool RunOnDevice() override {
+    std::call_once(once_, [&] { initialize(); });
+
+    // If any parameter has changed in between runs, the initialized
+    // algorithm is invalid and cannot be used.
+    update(current_);
+    CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
+
+    try {
+      algorithm_->run();
+    } catch (::gloo::IoException& ioe) {
+      LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
+      if (status_blob_ != "") {
+        signalFailure(ws_->GetBlob(status_blob_), ioe);
+        return false;
+      } else {
+        throw;
+      }
+    }
+    return true;
+  }
+
+ protected:
+  void initialize() {
+    // Allocate output tensor
+    CAFFE_ENFORCE_EQ(OutputSize(), 1);
+    auto comm_size =
+        OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
+    const auto dims =
+        std::vector<TIndex>(1, (InputSize() - 1) * Input(1).size() * comm_size);
+    Output(0)->Resize(dims);
+
+    // Store which inputs/outputs this instance initialized with
+    update(init_);
+
+    CAFFE_ENFORCE_EQ(init_.outputs.size(), 1);
+
+    // Verify tensors all have same size
+    size_t size = Input(1).size();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE_EQ(Input(i).size(), size);
+    }
+
+    // Verify tensors all have same type
+    TypeMeta meta = Input(1).meta();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE(Input(i).meta() == meta);
+    }
+
+    // Finally initialize the algorithm
+    initializeAlgorithm();
+  }
+
+  void initializeAlgorithm();
+
+  std::once_flag once_;
+  std::unique_ptr<::gloo::Algorithm> algorithm_;
+
+  // Captures the parameters passed to Gloo when first initialized.
+  // An instance is updated every time this op runs and is compared
+  // to the reference instance for equality. If any parameter has
+  // changed from run to run, the initialized algorithm is invalid.
+  void update(GlooParameters& params) {
+    params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
+    params.inputs.resize(InputSize() - 1);
+    params.size = Input(1).size();
+    params.meta = Input(1).meta();
+    for (auto i = 0; i < params.inputs.size(); i++) {
+      params.inputs[i] = Input(i + 1).template raw_data();
+    }
+    params.outputs.resize(OutputSize());
+    params.outputs[0] = Output(0)->raw_mutable_data(params.meta);
+  }
+
+  GlooParameters init_;
+  GlooParameters current_;
+  Workspace* ws_;
+  std::string status_blob_;
+};
+
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops.cc b/caffe2/contrib/gloo/allreduce_ops.cc
new file mode 100644
index 0000000..888e34a
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops.cc
@@ -0,0 +1,62 @@
+#include "allreduce_ops.h"
+
+#include <gloo/allreduce_halving_doubling.h>
+#include <gloo/allreduce_ring.h>
+#include <gloo/allreduce_ring_chunked.h>
+#include <gloo/types.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+void AllreduceOp<Context>::initializeHalvingDoubling() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+template <class Context>
+void AllreduceOp<Context>::initializeRingFull() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceRing<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceRing<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+template <class Context>
+void AllreduceOp<Context>::initializeRingChunked() {
+  if (init_.template IsType<float>()) {
+    algorithm_.reset(new ::gloo::AllreduceRingChunked<float>(
+        init_.context, init_.template getOutputs<float>(), init_.size));
+  } else if (init_.template IsType<::caffe2::float16>()) {
+    algorithm_.reset(new ::gloo::AllreduceRingChunked<::gloo::float16>(
+        init_.context,
+        init_.template getOutputs<::gloo::float16>(),
+        init_.size));
+  } else {
+    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
+  }
+}
+
+namespace {
+
+REGISTER_CPU_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CPUContext>);
+
+} // namespace
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h
new file mode 100644
index 0000000..8837b32
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops.h
@@ -0,0 +1,131 @@
+#pragma once
+
+#include <mutex>
+
+#include "caffe2/contrib/gloo/common.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+#include <gloo/algorithm.h>
+#include <gloo/common/error.h>
+#include <gloo/context.h>
+
+namespace caffe2 {
+namespace gloo {
+
+template <class Context>
+class AllreduceOp final : public Operator<Context> {
+  enum Mode { RING_FULL, RING_CHUNKED, HALVING_DOUBLING };
+
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  AllreduceOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        ws_(ws),
+        status_blob_(
+            OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
+        gpu_direct_(
+            OperatorBase::GetSingleArgument<bool>("gpu_direct", false)) {
+    if (status_blob_ != "") {
+      ws_->CreateBlob(status_blob_);
+    }
+  }
+
+  virtual ~AllreduceOp() {}
+
+  bool RunOnDevice() override {
+    std::call_once(once_, [&] { initialize(); });
+
+    // If any parameter has changed in between runs, the initialized
+    // algorithm is invalid and cannot be used.
+    update(current_);
+    CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
+
+    try {
+      algorithm_->run();
+    } catch (::gloo::IoException& ioe) {
+      LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
+      if (status_blob_ != "") {
+        signalFailure(ws_->GetBlob(status_blob_), ioe);
+        return false;
+      } else {
+        throw;
+      }
+    }
+    return true;
+  }
+
+ protected:
+  void initialize() {
+    Mode mode = HALVING_DOUBLING;
+    auto bytes = Input(1).nbytes();
+
+    // Store which inputs/outputs this instance initialized with
+    update(init_);
+
+    // Verify inputs == outputs
+    CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
+    for (auto i = 0; i < init_.inputs.size(); i++) {
+      CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
+    }
+
+    // Verify tensors all have same size
+    size_t size = Input(1).size();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE_EQ(Input(i).size(), size);
+    }
+
+    // Verify tensors all have same type
+    TypeMeta meta = Input(1).meta();
+    for (auto i = 2; i < InputSize(); i++) {
+      CAFFE_ENFORCE(Input(i).meta() == meta);
+    }
+
+    switch (mode) {
+      case RING_FULL:
+        initializeRingFull();
+        return;
+      case RING_CHUNKED:
+        initializeRingChunked();
+        return;
+      case HALVING_DOUBLING:
+        initializeHalvingDoubling();
+        return;
+    }
+
+    CAFFE_ENFORCE(false, "Unreachable code");
+  }
+
+  void initializeHalvingDoubling();
+  void initializeRingFull();
+  void initializeRingChunked();
+
+  std::once_flag once_;
+  std::unique_ptr<::gloo::Algorithm> algorithm_;
+
+  // Captures the parameters passed to Gloo when first initialized.
+  // An instance is updated every time this op runs and is compared
+  // to the reference instance for equality. If any parameter has
+  // changed from run to run, the initialized algorithm is invalid.
+  void update(GlooParameters& params) {
+    params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
+    params.inputs.resize(InputSize() - 1);
+    params.outputs.resize(OutputSize());
+    for (auto i = 0; i < params.inputs.size(); i++) {
+      params.inputs[i] = Input(i + 1).template raw_data();
+      params.outputs[i] = Output(i)->template raw_mutable_data();
+    }
+    params.size = Output(0)->size();
+    params.meta = Output(0)->meta();
+  }
+
+  GlooParameters init_;
+  GlooParameters current_;
+  Workspace* ws_;
+  std::string status_blob_;
+  const bool gpu_direct_;
+};
+
+} // namespace gloo
+} // namespace caffe2
diff --git a/caffe2/contrib/gloo/allreduce_ops_gpu.cc b/caffe2/contrib/gloo/allreduce_ops_gpu.cc
new file mode 100644
index 0000000..bbc187c
--- /dev/null
+++ b/caffe2/contrib/gloo/allreduce_ops_gpu.cc
@@ -0,0 +1,109 @@
+#include "allreduce_ops.h"
+
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/core/logging.h"
+
+#include <gloo/cuda_allreduce_halving_doubling.h>
+#include <gloo/cuda_allreduce_halving_doubling_pipelined.h>
+#include <gloo/cuda_allreduce_ring.h>
+#include <gloo/cuda_allreduce_ring_chunked.h>
+
+namespace caffe2 {
+namespace gloo {
+
+namespace {
+
+// Decides on using GPUDirect based on device support.
+template