feat: integrate CNNL and add unary/binary/softmax/batchnorm/reduce/transpose/pooling operators
Chamberlain0w0 committed Jan 10, 2024
1 parent 7f82d74 commit 9f34dda
Showing 40 changed files with 1,918 additions and 9 deletions.
8 changes: 2 additions & 6 deletions src/02hardware/CMakeLists.txt
@@ -3,14 +3,10 @@ project(hardware VERSION 0.0.0 LANGUAGES CXX)
message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})

# Source files
file(GLOB HARDWARE_SRC src/*.cc src/*.cpp src/devices/cpu/*.cc)
file(GLOB_RECURSE HARDWARE_SRC src/*.cc src/*.cpp)

if(USE_CUDA)
file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu src/devices/nvidia/*.cc)
endif()

if(USE_BANG)
file(GLOB_RECURSE HARDWARE_BANG_SRC src/devices/mlu/*.cc)
file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu)
endif()

add_library(hardware STATIC ${HARDWARE_SRC} ${HARDWARE_CUDA_SRC} ${HARDWARE_BANG_SRC})
2 changes: 2 additions & 0 deletions src/02hardware/src/device_manager.cpp
@@ -1,6 +1,7 @@
#include "hardware/device_manager.h"
#include "hardware/devices/cpu.h"
#include "hardware/devices/nvidia.h"
#include "hardware/devices/mlu.h"

namespace refactor::hardware::device {

@@ -37,6 +38,7 @@ namespace refactor::hardware::device {
using T = Device::Type;
// clang-format off
auto device = type == T::Nvidia ? std::make_shared<Nvidia>(card)
: type == T::Mlu ? std::make_shared<Mlu>(card)
: UNREACHABLEX(Arc<Device>, "");
// clang-format on
auto [kind, ok] = DEVICES.try_emplace(static_cast<int32_t>(type));
8 changes: 6 additions & 2 deletions src/02hardware/src/devices/mlu/device.cc
@@ -1,21 +1,25 @@
#include "functions.cc"
#include "functions.hh"
#include "hardware/devices/mlu.h"
#include "hardware/mem_pool.h"
#include "memory.hh"

namespace refactor::hardware {

static Arc<Memory> bangMemory(int32_t card) {
#ifdef USE_BANG
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}",
fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}",
card, free, total, size);
return std::make_shared<MemPool>(
std::make_shared<MluMemory>(),
size,
256ul);
#else
return nullptr;
#endif
}

Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {}
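Note: the pool size computed in bangMemory() above is "80% of the card's total memory, but at least 5 GiB, and never more than what is currently free". A minimal self-contained C++ sketch of the same rule with made-up numbers (the 16 GiB / 12 GiB figures and variable names are illustrative only):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical card: 16 GiB total, 12 GiB currently free.
    std::size_t totalBytes = 16ull << 30, freeBytes = 12ull << 30;
    // Same rule as bangMemory(): max(5 GiB, 80% of total), clamped to free memory.
    std::size_t size = std::min(freeBytes, std::max<std::size_t>(5ull << 30, totalBytes * 4 / 5));
    // Prints 12.0: 12.8 GiB requested, clamped to the 12 GiB that are free.
    std::printf("pool size: %.1f GiB\n", size / double(1ull << 30));
    return 0;
}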
2 changes: 2 additions & 0 deletions src/02hardware/src/devices/mlu/functions.cc
@@ -2,6 +2,7 @@

namespace refactor::hardware {

#ifdef USE_BANG
int getDeviceCount() {
unsigned deviceCount;
BANG_ASSERT(cnrtGetDeviceCount(&deviceCount));
Expand All @@ -15,5 +16,6 @@ namespace refactor::hardware {
BANG_ASSERT(cnrtMemGetInfo(&memInfo.free, &memInfo.total));
return memInfo;
}
#endif

}// namespace refactor::hardware
5 changes: 4 additions & 1 deletion src/02hardware/src/devices/mlu/functions.hh
@@ -1,14 +1,17 @@
#ifndef HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
#define HARDWARE_DEVICES_MLU_FUNCTIONS_CUH

#include "cnrt.h"
#include "common.h"

#ifdef USE_BANG
#include "cnrt.h"

#define BANG_ASSERT(STATUS) \
if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \
RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
cnrtGetErrorStr(status), (int) status)); \
}
#endif

namespace refactor::hardware {

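Note: BANG_ASSERT above turns any CNRT status other than CNRT_RET_SUCCESS into a RUNTIME_ERROR that embeds the statement text, cnrtGetErrorStr(status) and the numeric code. A minimal usage sketch, assuming the CNRT allocation call cnrtMalloc (the helper name allocOrThrow is illustrative and not part of this codebase):

// Sketch only; any CNRT call that returns a status can be wrapped the same way.
void *allocOrThrow(size_t bytes) {
    void *ptr = nullptr;
    // On failure this raises RUNTIME_ERROR with a message like:
    //   bang failed on "cnrtMalloc(&ptr, bytes)" with "<error string>" (<code>)
    BANG_ASSERT(cnrtMalloc(&ptr, bytes));
    return ptr;
}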
2 changes: 2 additions & 0 deletions src/02hardware/src/devices/mlu/memory.cc
@@ -2,6 +2,7 @@
#include "functions.hh"

namespace refactor::hardware {
#ifdef USE_BANG
using M = MluMemory;

void *M::malloc(size_t size) {
Expand All @@ -27,5 +28,6 @@ namespace refactor::hardware {
CNRT_MEM_TRANS_DIR_PEER2PEER));
return dst;
}
#endif

}// namespace refactor::hardware
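Note: the hunk above only shows the tail of MluMemory's device-to-device copy. For orientation, a free-standing sketch of such a copy using the same transfer direction; the helper name copyOnDevice and the classic cnrtMemcpy(dst, src, bytes, dir) signature are assumptions, not taken from this file:

// Illustrative helper, not the project's Memory interface.
void *copyOnDevice(void *dst, void const *src, size_t bytes) {
    BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                           CNRT_MEM_TRANS_DIR_PEER2PEER));
    return dst;
}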
4 changes: 4 additions & 0 deletions src/02hardware/src/devices/nvidia/device.cc
@@ -6,6 +6,7 @@
namespace refactor::hardware {

static Arc<Memory> cudaMemory(int32_t card) {
#ifdef USE_CUDA
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
Expand All @@ -16,6 +17,9 @@ namespace refactor::hardware {
std::make_shared<NvidiaMemory>(),
size,
256ul);
#else
return nullptr;
#endif
}

Nvidia::Nvidia(int32_t card) : Device(card, cudaMemory(card)) {}
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/batch_normalization.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/batch_normalization.h"
#include "../kernels/batch_normalization/cpu_kernel.hh"
#include "../kernels/batch_normalization/cudnn_kernel.hh"
#include "../kernels/batch_normalization/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -20,6 +21,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(BatchNormalizationCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(BatchNormalizationCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
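Note: the REGISTER macro used by this and the following collectors is defined elsewhere in the kernel sources and is not part of this diff. Judging from the explicit build-and-collect pattern in the softmax and transpose collectors further down, it presumably expands to something like the sketch below, where epsilon, inputs and ans stand in for whatever the surrounding collector has in scope:

// Hypothetical expansion of REGISTER(BatchNormalizationCnnl).
if (auto ptr = BatchNormalizationCnnl::build(epsilon, inputs); ptr) {
    ans.emplace_back(std::move(ptr));
}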
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/reduce.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/reduce.h"
#include "../kernels/reduce/cpu_kernel.hh"
#include "../kernels/reduce/cudnn_kernel.hh"
#include "../kernels/reduce/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -27,6 +28,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(ReduceCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(ReduceCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/simple_binary.cc
@@ -2,6 +2,7 @@
#include "../kernels/simple_binary/binary_cudnn.hh"
#include "../kernels/simple_binary/cpu_kernel.hh"
#include "../kernels/simple_binary/cuda_kernel.hh"
#include "../kernels/simple_binary/binary_cnnl.hh"

namespace refactor::kernel {

@@ -48,6 +49,9 @@ namespace refactor::kernel {
REGISTER_BROCAST(BinaryCudnn)
REGISTER(BinaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER_BROCAST(BinaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/simple_unary.cc
@@ -2,6 +2,8 @@
#include "../kernels/simple_unary/cpu_kernel.hh"
#include "../kernels/simple_unary/cuda_kernel.hh"
#include "../kernels/simple_unary/cudnn_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_simple_unary_kernel.hh"
#include "common.h"

namespace refactor::kernel {
@@ -54,6 +56,10 @@ namespace refactor::kernel {
REGISTER(ActivationCudnn)
REGISTER(SimpleUnaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER(ActivationCnnl)
REGISTER(SimpleUnaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
7 changes: 7 additions & 0 deletions src/04kernel/src/collectors/softmax.cc
@@ -1,4 +1,5 @@
#include "kernel/collectors/softmax.h"
#include "../kernels/softmax/cnnl_kernel.hh"
#include "../kernels/softmax/cpu_kernel.hh"
#include "../kernels/softmax/cuda_kernel.hh"
#include "../kernels/softmax/cudnn_kernel.hh"
@@ -28,6 +29,12 @@ namespace refactor::kernel {
}
break;
}
case decltype(_target)::Mlu: {
if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
}
default:
UNREACHABLEX(void, "Unknown target");
}
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/transpose.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/transpose.h"
#include "../kernels/transpose/cpu_kernel.hh"
#include "../kernels/transpose/cuda_kernel.hh"
#include "../kernels/transpose/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -25,6 +26,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = TransposeCnnl::build(data.dataType, data.shape, perm); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
158 changes: 158 additions & 0 deletions src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc
@@ -0,0 +1,158 @@
#include "cnnl_kernel.hh"

#ifdef USE_BANG
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#include <cnnl.h>
#endif

namespace refactor::kernel {
using K = BatchNormalizationCnnl;
using DT = DataType;

K::BatchNormalizationCnnl(decltype(info) info_) noexcept
: info(info_) {}

auto K::build(float epsilon, TensorRefs inputs) noexcept -> KernelBox {
#ifndef USE_BANG
return nullptr;
#endif

auto const &x = inputs[0].get();
auto const &scale = inputs[1].get();
auto const &mean = inputs[3].get();

if (x.rank() != 4) {
return nullptr;
}

// see "Supported Configurations for `cnnlBatchNormalizationForwardInference`"
if (scale.dataType != mean.dataType) {
return nullptr;
}
if (x.dataType == DT::F64) {
if (scale.dataType != DT::F64) {
return nullptr;
}
} else {
if (scale.dataType != DT::F32) {
return nullptr;
}
}
return std::make_unique<K>(decltype(info){
epsilon,
x.dataType,
scale.dataType,
x.layout,
{
static_cast<int>(x.shape[0]),
static_cast<int>(x.shape[1]),
static_cast<int>(x.shape[2]),
static_cast<int>(x.shape[3]),
}});
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing batch normalization for non-training-mode using CNNL";
}

#ifdef USE_BANG

auto K::lower(Resources &res) const -> RoutineWorkspace {
using namespace cnnl;
using namespace runtime;
using DT = DataType;

// RAII for closure
struct Descriptors {
cnnlTensorDescriptor_t inDesc, inDescTrans, p;
cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW;
bool f32;

explicit Descriptors(decltype(f32) f32_)
: inDesc(nullptr), inDescTrans(nullptr), p(nullptr),
NCHW2NHWC(nullptr), NHWC2NCHW(nullptr), f32(f32_) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDescTrans));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&p));
CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC));
CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW));
}
~Descriptors() noexcept(false) {
CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDescTrans));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(p));
CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC));
CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW));
}

Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>(info.dtX != DT::F64);
int dimNCHW[4] = {info.dimAx[0], info.dimAx[1], info.dimAx[2], info.dimAx[3]};
int dimNHWC[4] = {info.dimAx[0], info.dimAx[2], info.dimAx[3], info.dimAx[1]};
int dimParam[]{info.dimAx[1]};
setCnnlTensor(d->inDesc, info.dtX, slice(dimNCHW, 4));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->inDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dtX), 4, dimNHWC));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->p, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dtP), 1, dimParam));
int permute[4] = {0, 2, 3, 1};
int permuteOut[4] = {0, 3, 1, 2};
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permute));
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut));

auto handle = res.fetchOrStore<CnnlContext>()->handle;
auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX);
size_t workspaceSize;
CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize));
size_t totalWorkspaceSize = xTransSize + workspaceSize;

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d),
epsilon = info.epsilon,
xTransSize, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// fetch cnnl handle from resources
auto handle = res.fetchOrStore<CnnlContext>()->handle;

// name inputs and outputs
auto x = inputs[0],
scale = inputs[1],
bias = inputs[2],
mean = inputs[3],
var = inputs[4];
auto y = outputs[0];

void *xTrans = workspace;
void *yTrans = xTrans + xTransSize;
void *cursor = yTrans + workspaceSize;

// transpose NCHW input to NHWC
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x,
d->inDescTrans, xTrans, cursor, workspaceSize));

// build alpha/beta for double
auto a = d->f32 ? factor<fp32_t>(1) : factor<fp64_t>(1),
b = d->f32 ? factor<fp32_t>(0) : factor<fp64_t>(0);
CNNL_ASSERT(cnnlBatchNormForwardInference(
handle, &a, &b,
d->inDescTrans, xTrans, d->p, scale, bias, mean, var,
epsilon, d->inDescTrans, yTrans));

// transpose NHWC intermediates to NCHW
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans,
d->inDesc, y, cursor, workspaceSize));

BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
};

return {std::move(routine), totalWorkspaceSize};
}

#endif

}// namespace refactor::kernel