compare bangSoftmax and cnnlSoftmax #95

Open · wants to merge 2 commits into base: master
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -7,6 +7,9 @@ if(USE_CUDA)
file(GLOB_RECURSE KERNEL_CUDA_SRC src/*.cu)
add_subdirectory(cuda)
endif()
if(USE_BANG)
file(GLOB_RECURSE KERNEL_BANG_SRC src/*.mlu)
endif()

add_library(kernel STATIC ${KERNEL_SRC} ${KERNEL_CUDA_SRC})
target_link_libraries(kernel PUBLIC runtime)
11 changes: 11 additions & 0 deletions src/04kernel/src/collectors/softmax.cc
@@ -1,4 +1,6 @@
#include "kernel/collectors/softmax.h"
#include "../kernels/softmax/bang_kernel.hh"
#include "../kernels/softmax/cnnl_kernel.hh"
#include "../kernels/softmax/cpu_kernel.hh"
#include "../kernels/softmax/cuda_kernel.hh"
#include "../kernels/softmax/cudnn_kernel.hh"
@@ -28,6 +30,15 @@ namespace refactor::kernel {
}
break;
}
case decltype(_target)::Mlu: {
if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) {
ans.emplace_back(std::move(ptr));
}
if (auto ptr = SoftmaxBang::build(info); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
}
default:
UNREACHABLEX(void, "Unknown target");
}
29 changes: 29 additions & 0 deletions src/04kernel/src/kernels/softmax/bang_kernel.cc
@@ -0,0 +1,29 @@
#include "bang_kernel.hh"

namespace refactor::kernel {
using K = SoftmaxBang;

K::SoftmaxBang(SoftmaxInfo info_) noexcept
: Kernel(), info(std::move(info_)) {}

auto K::build(SoftmaxInfo info) noexcept -> KernelBox {
#ifndef USE_BANG
return nullptr;
#endif

return info.type.isFloat()
? std::make_unique<K>(std::move(info))
: nullptr;
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing Softmax using BANG";
}

}// namespace refactor::kernel
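
A note on the typeId() idiom above: taking the address of a function-local static yields a value that is unique to that class for the lifetime of the process, so it serves as a cheap runtime type tag without RTTI. A minimal standalone sketch of the idea (hypothetical kernel names, not part of this PR):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Each class owns its own function-local static; the static's address is
// a process-unique value usable as an identifier.
struct KernelA {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};
struct KernelB {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};

int main() {
    // Same class: same id. Different classes: different ids.
    std::printf("%d %d\n",
                KernelA::typeId() == KernelA::typeId(),  // prints 1
                KernelA::typeId() == KernelB::typeId()); // prints 0
    return 0;
}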
26 changes: 26 additions & 0 deletions src/04kernel/src/kernels/softmax/bang_kernel.hh
@@ -0,0 +1,26 @@
#ifndef KERNEL_SOFTMAX_BANG_HH
#define KERNEL_SOFTMAX_BANG_HH

#include "cnnl.h"
#include "cnrt.h"
#include "kernel/attributes/softmax_info.h"
#include "kernel/collectors/softmax.h"
namespace refactor::kernel {

struct SoftmaxBang final : public Kernel {
SoftmaxInfo info;

SoftmaxBang(SoftmaxInfo) noexcept;
static KernelBox build(SoftmaxInfo) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_BANG
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif//KERNEL_SOFTMAX_BANG_HH
599 changes: 599 additions & 0 deletions src/04kernel/src/kernels/softmax/bang_kernel.mlu

Large diffs are not rendered by default.
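
For orientation while the .mlu source is collapsed: the kernel computes softmax over the (pre, mid, post) view used throughout this PR, where mid spans the softmax axis and pre/post fold the leading and trailing dimensions. The CPU reference below is illustrative only; it is not the contents of bang_kernel.mlu, which presumably distributes the same work across MLU cores.

#include <cmath>
#include <vector>

// Reference softmax over a (pre, mid, post) layout: reductions run along
// the `mid` axis for every (pre, post) pair.
void softmaxRef(std::vector<float> const &x, std::vector<float> &y,
                int pre, int mid, int post) {
    for (int i = 0; i < pre; ++i)
        for (int k = 0; k < post; ++k) {
            auto at = [&](int j) { return (i * mid + j) * post + k; };
            float max = x[at(0)];// 1) running max for numerical stability
            for (int j = 1; j < mid; ++j) max = std::fmax(max, x[at(j)]);
            float sum = 0;// 2) exponentiate and accumulate
            for (int j = 0; j < mid; ++j) sum += y[at(j)] = std::exp(x[at(j)] - max);
            for (int j = 0; j < mid; ++j) y[at(j)] /= sum;// 3) normalize
        }
}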

88 changes: 88 additions & 0 deletions src/04kernel/src/kernels/softmax/cnnl_kernel.cc
@@ -0,0 +1,88 @@
#include "cnnl_kernel.hh"

#ifdef USE_BANG
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#endif

namespace refactor::kernel {
using K = SoftmaxCnnl;

K::SoftmaxCnnl(cnnl::SoftmaxAlgo algo_, DataType type_,
int pre_, int mid_, int post_) noexcept
: Kernel(), algo(algo_), dataType(type_),
pre(pre_), mid(mid_), post(post_) {}

auto K::build(cnnl::SoftmaxAlgo algo, SoftmaxInfo info) noexcept -> KernelBox {
#ifndef USE_BANG
return nullptr;
#endif

return std::make_unique<K>(algo, info.type, info.pre, info.mid, info.post);
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t {
return typeId();
}
auto K::description() const noexcept -> std::string_view {
return "Performing softmax forward with CNNL";
}

#ifdef USE_BANG

auto SoftmaxCnnl::lower(Resources &res) const -> RoutineWorkspace {
using namespace cnnl;
using namespace runtime;

// RAII for closure
struct Descriptors {
cnnlTensorDescriptor_t t;
cnnlSoftmaxAlgorithm_t algo;
bool f32;

Descriptors(decltype(algo) algo_, decltype(f32) f32_)
: algo(algo_), f32(f32_) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&t));
}
~Descriptors() noexcept(false) {
CNNL_ASSERT(cnnlDestroyTensorDescriptor(t));
}
Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};

auto d = std::make_shared<Descriptors>(
static_cast<cnnlSoftmaxAlgorithm_t>(algo),
dataType != DataType::F64);
int dims[]{pre, mid, post};
// cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION
// : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION
// : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
// FIXME(bolun): CNNL Softmax mode
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;

// cnnlSoftmaxForward_v2 is applied to a 3D input tensor only
CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims));

res.fetchOrStore<CnnlContext>();
return [d = std::move(d), mode](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// build alpha/beta for double
auto a = d->f32 ? factor<fp32_t>(1) : factor<fp64_t>(1),
b = d->f32 ? factor<fp32_t>(0) : factor<fp64_t>(0);
CNNL_ASSERT(cnnlSoftmaxForward_v2(
res.fetchOrStore<CnnlContext>()->handle,
d->algo,
mode,
CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&a, d->t, inputs[0],
&b, d->t, outputs[0]));
};
}

#endif

}// namespace refactor::kernel
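
On the alpha/beta handling in the lambda above: a and b are built with factor<fp32_t> or factor<fp64_t> (defined in cnnl_functions.h further down), which packs the scalar's bit pattern into a uint64_t so one variable can back either width when its address is passed to the library; per the inline comment, the wider form is only needed for F64 tensors. A minimal sketch of that bit-packing idea, illustrative only and written with memcpy rather than the union used in the header:

#include <cstdint>
#include <cstring>
#include <type_traits>

// Store a float or double bit pattern in the low-order bytes of a uint64_t.
// Taking the address of the result and reading it back as the original
// scalar type works on little-endian hosts, where those are the first bytes.
template<class T>
uint64_t factorSketch(T x) {
    static_assert(std::is_floating_point_v<T> && sizeof(T) <= sizeof(uint64_t));
    uint64_t bits = 0;
    std::memcpy(&bits, &x, sizeof(T));
    return bits;
}

// Mirrors the usage in the lambda: pick the width that matches the tensor.
// uint64_t a = isF32 ? factorSketch(1.0f) : factorSketch(1.0);
// uint64_t b = isF32 ? factorSketch(0.0f) : factorSketch(0.0);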
36 changes: 36 additions & 0 deletions src/04kernel/src/kernels/softmax/cnnl_kernel.hh
@@ -0,0 +1,36 @@
#ifndef KERNEL_SOFTMAX_CNNL_HH
#define KERNEL_SOFTMAX_CNNL_HH

#include "kernel/attributes/softmax_info.h"
#include "kernel/collectors/softmax.h"

namespace refactor::kernel {

namespace cnnl {
enum class SoftmaxAlgo {
FAST = 0,
ACCURATE = 1,
LOG = 2,
};
}// namespace cnnl

struct SoftmaxCnnl final : public Kernel {
cnnl::SoftmaxAlgo algo;
DataType dataType;
int pre, mid, post;

SoftmaxCnnl(cnnl::SoftmaxAlgo, DataType, int, int, int) noexcept;

static KernelBox build(cnnl::SoftmaxAlgo, SoftmaxInfo) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_BANG
RoutineWorkspace lower(Resources &) const final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_SOFTMAX_CNNL_HH
45 changes: 45 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.cc
@@ -0,0 +1,45 @@
#ifdef USE_BANG

#include "cnnl_context.hh"
#include "cnnl_functions.h"

namespace refactor::kernel::cnnl {

CnnlContext::CnnlContext() : runtime::Resource() {
BANG_ASSERT(cnrtQueueCreate(&queue));
CNNL_ASSERT(cnnlCreate(&handle));
CNNL_ASSERT(cnnlSetQueue(handle, queue));
}
CnnlContext::~CnnlContext() {
BANG_ASSERT(cnrtQueueDestroy(queue));
CNNL_ASSERT(cnnlDestroy(handle));
}

auto CnnlContext::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}
auto CnnlContext::build() -> runtime::ResourceBox {
return std::make_unique<CnnlContext>();
}

auto CnnlContext::resourceTypeId() const noexcept -> size_t {
return typeId();
}
auto CnnlContext::description() const noexcept -> std::string_view {
return "CnnlContext";
}

void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) {
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), size,
CNRT_MEM_TRANS_DIR_HOST2DEV));
}

void CnnlContext::queueSync() {
BANG_ASSERT(cnrtQueueSync(queue));
}

}// namespace refactor::kernel::cnnl

#endif
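
CnnlContext bundles the cnrtQueue_t and cnnlHandle_t that the softmax routine obtains through res.fetchOrStore<CnnlContext>(), so one queue/handle pair is created lazily per Resources instance and then reused. The real Resources API is not shown in this diff; the following is only a rough, hypothetical sketch of what a fetch-or-create registry keyed by the static typeId trick can look like:

#include <cstddef>
#include <map>
#include <memory>

// Hypothetical, simplified resource registry; illustrates fetchOrStore
// semantics only and is not the project's actual runtime::Resources.
struct Resource {
    virtual ~Resource() = default;
};

struct ResourcesSketch {
    std::map<size_t, std::unique_ptr<Resource>> pool;

    template<class R>
    R *fetchOrStore() {
        auto &slot = pool[R::typeId()];          // keyed by the per-class static id
        if (!slot) slot = std::make_unique<R>(); // created on first request only
        return static_cast<R *>(slot.get());
    }
};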

32 changes: 32 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.hh
@@ -0,0 +1,32 @@
#ifndef KERNEL_CNNL_CONTEXT_HH
#define KERNEL_CNNL_CONTEXT_HH

#include "runtime/resource.h"
#include <cnnl.h>
#include <cnrt.h>

namespace refactor::kernel::cnnl {

struct CnnlContext final : public runtime::Resource {
cnnlHandle_t handle;
cnrtQueue_t queue;

CnnlContext();
~CnnlContext();
CnnlContext(CnnlContext const &) noexcept = delete;
CnnlContext(CnnlContext &&) noexcept = delete;

static size_t typeId() noexcept;
static runtime::ResourceBox build();

size_t resourceTypeId() const noexcept final;
std::string_view description() const noexcept final;

void copyFromCPU(void *dst, const void *src, size_t size);
void queueSync();
};

}// namespace refactor::kernel::cnnl

#endif// KERNEL_CNNL_CONTEXT_HH

39 changes: 39 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_functions.cpp
@@ -0,0 +1,39 @@
#ifdef USE_BANG

#include "cnnl_functions.h"

namespace refactor::kernel::cnnl {

cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
// clang-format off
switch (dataType) {
case DataType::F32 : return CNNL_DTYPE_FLOAT; break;
case DataType::F64 : return CNNL_DTYPE_DOUBLE; break;
case DataType::FP16: return CNNL_DTYPE_HALF; break;
case DataType::I8 : return CNNL_DTYPE_INT8; break;
case DataType::I32 : return CNNL_DTYPE_INT32; break;
case DataType::U8 : return CNNL_DTYPE_UINT8; break;
case DataType::BF16: return CNNL_DTYPE_BFLOAT16; break;
case DataType::I64 : return CNNL_DTYPE_INT64; break;
case DataType::Bool: return CNNL_DTYPE_BOOL; break;
default: UNREACHABLE();
}
// clang-format on
}

void setCnnlTensor(cnnlTensorDescriptor_t t, DataType dt, slice_t<int> d) {
auto dt_ = cnnlDataTypeConvert(dt);
if (auto n = d.size(); n == 4) {
CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin()));
} else if (n < 4) {
int d_[]{1, 1, 1, 1};
std::copy_n(d.begin(), n, d_ + 4 - n);
CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, 4, std::move(d_)));
} else {
CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin()));
}
}
}// namespace refactor::kernel::cnnl

#endif
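
The n < 4 branch of setCnnlTensor right-aligns the given dimensions into a 4-D NCHW descriptor, filling the leading positions with 1. A self-contained example of that padding logic (same copy as in the function above):

#include <algorithm>
#include <cstdio>

int main() {
    // A 2-D shape {3, 224} right-aligned into four NCHW dims, matching
    // std::copy_n(d.begin(), n, d_ + 4 - n) with n == 2.
    int d[]{3, 224};
    int n = 2;
    int d_[]{1, 1, 1, 1};
    std::copy_n(d, n, d_ + 4 - n);
    std::printf("%d %d %d %d\n", d_[0], d_[1], d_[2], d_[3]);// prints: 1 1 3 224
    return 0;
}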

41 changes: 41 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_functions.h
@@ -0,0 +1,41 @@
#ifndef KERNEL_CNNL_FUNCTIONS_H
#define KERNEL_CNNL_FUNCTIONS_H

#include "common.h"
#include <cnnl.h>

#define BANG_ASSERT(STATUS) \
if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \
RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
cnrtGetErrorStr(status), (int) status)); \
}

#define CNNL_ASSERT(STATUS) \
if (auto status = (STATUS); status != CNNL_STATUS_SUCCESS) { \
fmt::println("cnnl failed on \"" #STATUS "\" with {}", \
cnnlGetErrorString(status)); \
abort(); \
}

namespace refactor::kernel::cnnl {

cnnlDataType_t cnnlDataTypeConvert(DataType);

// A helper function that sets a CNNL tensor descriptor given the tensor shape and data type
void setCnnlTensor(cnnlTensorDescriptor_t, DataType, slice_t<int>);

template<class T>
constexpr uint64_t factor(T x) noexcept {
static_assert(std::is_floating_point_v<T>);
static_assert(sizeof(T) <= sizeof(uint64_t));
union {
T f;
uint64_t i;
} u{x};
return u.i;
}

}// namespace refactor::kernel::cnnl

#endif// KERNEL_CNNL_FUNCTIONS_H
