From c291c79c53cd1420f8acae993620bdac549cb033 Mon Sep 17 00:00:00 2001 From: Bolun Date: Fri, 5 Jan 2024 06:57:40 +0000 Subject: [PATCH 1/9] =?UTF-8?q?feat:=20=E6=8E=A5=E5=85=A5CNNL=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E6=B7=BB=E5=8A=A0unary/binary/softmax/batchnorm/reduc?= =?UTF-8?q?e/transpose/pooling=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/02hardware/src/device_manager.cpp | 1 + .../src/collectors/batch_normalization.cc | 4 + src/04kernel/src/collectors/pool.cc | 6 + src/04kernel/src/collectors/reduce.cc | 4 + src/04kernel/src/collectors/simple_binary.cc | 4 + src/04kernel/src/collectors/simple_unary.cc | 6 + src/04kernel/src/collectors/softmax.cc | 7 + src/04kernel/src/collectors/transpose.cc | 6 + .../batch_normalization/cnnl_kernel.cc | 158 ++++++++++++++ .../batch_normalization/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/pool/cnnl_kernel.cc | 156 ++++++++++++++ src/04kernel/src/kernels/pool/cnnl_kernel.hh | 45 ++++ .../src/kernels/reduce/cnnl_kernel.cc | 128 ++++++++++++ .../src/kernels/reduce/cnnl_kernel.hh | 32 +++ .../src/kernels/simple_binary/binary_cnnl.cc | 195 ++++++++++++++++++ .../src/kernels/simple_binary/binary_cnnl.hh | 28 +++ .../simple_unary/cnnl_activation_kernel.cc | 91 ++++++++ .../simple_unary/cnnl_activation_kernel.hh | 27 +++ .../simple_unary/cnnl_simple_unary_kernel.cc | 94 +++++++++ .../simple_unary/cnnl_simple_unary_kernel.hh | 27 +++ .../src/kernels/softmax/cnnl_kernel.cc | 86 ++++++++ .../src/kernels/softmax/cnnl_kernel.hh | 36 ++++ .../src/kernels/transpose/cnnl_kernel.cc | 92 +++++++++ .../src/kernels/transpose/cnnl_kernel.hh | 32 +++ .../src/utilities/bang/cnnl_context.cc | 35 ++++ .../src/utilities/bang/cnnl_context.hh | 29 +++ .../src/utilities/bang/cnnl_functions.cpp | 38 ++++ .../src/utilities/bang/cnnl_functions.h | 40 ++++ .../kernels/batch_normalization/test_cnnl.cpp | 70 +++++++ src/04kernel/test/kernels/pool/test_cnnl.cpp | 70 +++++++ 
.../test/kernels/reduce/test_cnnl.cpp | 64 ++++++ .../simple_binary/test_binary_cnnl.cpp | 90 ++++++++ .../test/kernels/simple_unary/test_cnnl.cpp | 63 ++++++ .../test/kernels/softmax/test_cnnl.cpp | 52 +++++ .../test/kernels/transpose/test_cnnl.cpp | 55 +++++ 35 files changed, 1903 insertions(+) create mode 100644 src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/pool/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/pool/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/reduce/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/reduce/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/simple_binary/binary_cnnl.cc create mode 100644 src/04kernel/src/kernels/simple_binary/binary_cnnl.hh create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/transpose/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/transpose/cnnl_kernel.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.cc create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.cpp create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.h create mode 100644 src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/pool/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/reduce/test_cnnl.cpp create mode 100644 
src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp create mode 100644 src/04kernel/test/kernels/simple_unary/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/softmax/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/transpose/test_cnnl.cpp diff --git a/src/02hardware/src/device_manager.cpp b/src/02hardware/src/device_manager.cpp index 15ae3b901..8a7f8edd2 100644 --- a/src/02hardware/src/device_manager.cpp +++ b/src/02hardware/src/device_manager.cpp @@ -2,6 +2,7 @@ #include "hardware/devices/cpu.h" #include "hardware/devices/mlu.h" #include "hardware/devices/nvidia.h" +#include "hardware/devices/mlu.h" /* NOTE(review): duplicate — "hardware/devices/mlu.h" is already in the context two lines above; this hunk should be dropped from the patch */ namespace refactor::hardware::device { diff --git a/src/04kernel/src/collectors/batch_normalization.cc b/src/04kernel/src/collectors/batch_normalization.cc index 93bcb240e..e944e37d7 100644 --- a/src/04kernel/src/collectors/batch_normalization.cc +++ b/src/04kernel/src/collectors/batch_normalization.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/batch_normalization.h" #include "../kernels/batch_normalization/cpu_kernel.hh" #include "../kernels/batch_normalization/cudnn_kernel.hh" +#include "../kernels/batch_normalization/cnnl_kernel.hh" namespace refactor::kernel { @@ -20,6 +21,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(BatchNormalizationCudnn) break; + case decltype(_target)::Mlu: + REGISTER(BatchNormalizationCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/pool.cc b/src/04kernel/src/collectors/pool.cc index 458d3a375..d034e199f 100644 --- a/src/04kernel/src/collectors/pool.cc +++ b/src/04kernel/src/collectors/pool.cc @@ -1,5 +1,6 @@ #include "kernel/collectors/pool.h" #include "../kernels/pool/cudnn_kernel.hh" +#include "../kernels/pool/cnnl_kernel.hh" namespace refactor::kernel { @@ -29,6 +30,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PoolCnnl::build(type, ceil,
kernelShape, attributes, x, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/reduce.cc b/src/04kernel/src/collectors/reduce.cc index bec37731d..71fa194ba 100644 --- a/src/04kernel/src/collectors/reduce.cc +++ b/src/04kernel/src/collectors/reduce.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/reduce.h" #include "../kernels/reduce/cpu_kernel.hh" #include "../kernels/reduce/cudnn_kernel.hh" +#include "../kernels/reduce/cnnl_kernel.hh" namespace refactor::kernel { @@ -27,6 +28,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(ReduceCudnn) break; + case decltype(_target)::Mlu: + REGISTER(ReduceCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/simple_binary.cc b/src/04kernel/src/collectors/simple_binary.cc index 53ae6723c..e61d013f9 100644 --- a/src/04kernel/src/collectors/simple_binary.cc +++ b/src/04kernel/src/collectors/simple_binary.cc @@ -2,6 +2,7 @@ #include "../kernels/simple_binary/binary_cudnn.hh" #include "../kernels/simple_binary/cpu_kernel.hh" #include "../kernels/simple_binary/cuda_kernel.hh" +#include "../kernels/simple_binary/binary_cnnl.hh" namespace refactor::kernel { @@ -50,6 +51,9 @@ namespace refactor::kernel { REGISTER_BROCAST(BinaryCudnn) REGISTER(BinaryCuda) break; + case decltype(_target)::Mlu: + REGISTER_BROCAST(BinaryCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/simple_unary.cc b/src/04kernel/src/collectors/simple_unary.cc index 51a334c91..c489acecf 100644 --- a/src/04kernel/src/collectors/simple_unary.cc +++ b/src/04kernel/src/collectors/simple_unary.cc @@ -2,6 +2,8 @@ #include "../kernels/simple_unary/cpu_kernel.hh" #include "../kernels/simple_unary/cuda_kernel.hh" #include "../kernels/simple_unary/cudnn_activation_kernel.hh" +#include "../kernels/simple_unary/cnnl_activation_kernel.hh" +#include 
"../kernels/simple_unary/cnnl_simple_unary_kernel.hh" #include "common.h" namespace refactor::kernel { @@ -55,6 +57,10 @@ namespace refactor::kernel { REGISTER(ActivationCudnn) REGISTER(SimpleUnaryCuda) break; + case decltype(_target)::Mlu: + REGISTER(ActivationCnnl) + REGISTER(SimpleUnaryCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/softmax.cc b/src/04kernel/src/collectors/softmax.cc index 2ce442696..020bc6ded 100644 --- a/src/04kernel/src/collectors/softmax.cc +++ b/src/04kernel/src/collectors/softmax.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/softmax.h" +#include "../kernels/softmax/cnnl_kernel.hh" #include "../kernels/softmax/cpu_kernel.hh" #include "../kernels/softmax/cuda_kernel.hh" #include "../kernels/softmax/cudnn_kernel.hh" @@ -28,6 +29,12 @@ namespace refactor::kernel { } break; } + case decltype(_target)::Mlu: { + if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + } default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/transpose.cc b/src/04kernel/src/collectors/transpose.cc index c8d651974..c91493f98 100644 --- a/src/04kernel/src/collectors/transpose.cc +++ b/src/04kernel/src/collectors/transpose.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/transpose.h" #include "../kernels/transpose/cpu_kernel.hh" #include "../kernels/transpose/cuda_kernel.hh" +#include "../kernels/transpose/cnnl_kernel.hh" namespace refactor::kernel { @@ -25,6 +26,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = TransposeCnnl::build(data.dataType, data.shape, perm); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc new file mode 100644 index 
000000000..be06233cd --- /dev/null +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -0,0 +1,158 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = BatchNormalizationCnnl; + using DT = DataType; + + K::BatchNormalizationCnnl(decltype(info) info_) noexcept + : info(info_) {} + + auto K::build(float epsilon, TensorRefs inputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + auto const &x = inputs[0].get(); + auto const &scale = inputs[1].get(); + auto const &mean = inputs[3].get(); + + if (x.rank() != 4) { + return nullptr; + } + + // see "Supported Configurations for `cnnlBatchNormalizationForwardInference`" + if (scale.dataType != mean.dataType) { + return nullptr; + } + if (x.dataType == DT::F64) { + if (scale.dataType != DT::F64) { + return nullptr; + } + } else { + if (scale.dataType != DT::F32) { + return nullptr; + } + } + return std::make_unique(decltype(info){ + epsilon, + x.dataType, + scale.dataType, + x.layout, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }}); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing batch normalization for non-training-mode using CNNL"; + } + +#ifdef USE_BANG + + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t inDesc, inDescTrans, p; + cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : inDesc(nullptr), inDescTrans(nullptr), p(nullptr), + 
NCHW2NHWC(nullptr), NHWC2NCHW(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDescTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&p)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDescTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(p)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dtX != DT::F64); + int dimNCHW[4] = {info.dimAx[0], info.dimAx[1], info.dimAx[2], info.dimAx[3]}; + int dimNHWC[4] = {info.dimAx[0], info.dimAx[2], info.dimAx[3], info.dimAx[1]}; + int dimParam[]{info.dimAx[1]}; + setCnnlTensor(d->inDesc, info.dtX, slice(dimNCHW, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->inDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dtX), 4, dimNHWC)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->p, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dtP), 1, dimParam)); + int permute[4] = {0, 2, 3, 1}; + int permuteOut[4] = {0, 3, 1, 2}; + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permute)); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); + + auto handle = res.fetchOrStore()->handle; + auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX); /* NOTE(review): sizeof(info.dtX) is sizeof(DataType), not the element byte width — confirm against the project's DataType API and use the element size */ + size_t workspaceSize; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize)); + size_t totalWorkspaceSize = xTransSize * 2 + workspaceSize; /* xTrans + yTrans staging buffers + transpose scratch; the routine below carves all three out of this one allocation */ + + res.fetchOrStore(); + auto routine = [d = std::move(d), + epsilon = info.epsilon, + xTransSize, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from
resources + auto handle = res.fetchOrStore()->handle; + + // name inputs and outputs + auto x = inputs[0], + scale = inputs[1], + bias = inputs[2], + mean = inputs[3], + var = inputs[4]; + auto y = outputs[0]; + + void *xTrans = workspace; + void *yTrans = reinterpret_cast<uint8_t *>(xTrans) + xTransSize; /* byte-typed arithmetic — pointer math on void* is a GNU extension, not standard C++ */ + void *cursor = reinterpret_cast<uint8_t *>(yTrans) + xTransSize; /* transpose scratch sits AFTER both staging buffers; the previous "+ workspaceSize" offset overlapped yTrans and ran past the allocation */ + + // transpose NCHW input to NHWC + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, + d->inDescTrans, xTrans, cursor, workspaceSize)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? factor(0) : factor(0); + CNNL_ASSERT(cnnlBatchNormForwardInference( + handle, &a, &b, + d->inDescTrans, xTrans, d->p, scale, bias, mean, var, + epsilon, d->inDescTrans, yTrans)); + + // transpose NHWC intermediates to NCHW + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans, + d->inDesc, y, cursor, workspaceSize)); + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), totalWorkspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh new file mode 100644 index 000000000..978b0dedc --- /dev/null +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH +#define KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + /// @brief Use `cnnlBatchNormalizationForwardInference`. + /// It only supports 4D and 5D tensors.
+ struct BatchNormalizationCnnl final : public Kernel { + struct { + float epsilon; + DataType dtX, dtP; + LayoutType layout; + int dimAx[4];// dimA for x + } info; + + explicit BatchNormalizationCnnl(decltype(info)) noexcept; + + static KernelBox build(float, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.cc b/src/04kernel/src/kernels/pool/cnnl_kernel.cc new file mode 100644 index 000000000..083125b1f --- /dev/null +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.cc @@ -0,0 +1,156 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = PoolCnnl; + + K::PoolCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(PoolType poolType, + bool ceil, + KernelShape const &kernelShape, + PoolAttributes const &poolAttributes, + Tensor const &x, + Tensor const &y) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + // TODO check data type + auto p = poolAttributes.pads(), + d = poolAttributes.dilations(), + s = poolAttributes.strides(); + if (x.rank() != 4 || + poolType == PoolType::Lp || + d[0] != 1 || d[1] != 1) { + return nullptr; + } + return std::make_unique(decltype(info){ + poolType, + x.dataType, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }, + { + static_cast(y.shape[0]), + static_cast(y.shape[1]), + static_cast(y.shape[2]), + static_cast(y.shape[3]), + }, + { + static_cast(kernelShape[0]), + static_cast(kernelShape[1]), + }, + {p[0], p[1], p[2], p[3]}, + {s[0], s[1]}, + 
{d[0], d[1]}, + ceil + }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing pool using CNNL"; + } + +#ifdef USE_BANG + + auto PoolCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = PoolType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlPoolingDescriptor_t pooling; + bool f32; + + Descriptors(decltype(f32) f32_) : f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreatePoolingDescriptor(&pooling)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyPoolingDescriptor(pooling)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dt != DataType::F64); + int const + xs[]{ + info.xShape[0], + info.xShape[1], + info.xShape[2] + std::abs(info.pads[0] - info.pads[2]), + info.xShape[3] + std::abs(info.pads[1] - info.pads[3]), + }, + *ys = info.yShape; + setCnnlTensor(d->x, info.dt, slice(xs, 4)); + setCnnlTensor(d->y, info.dt, slice(ys, 4)); + + // clang-format off + auto mode = info.poolType == Ty::Average ? CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : info.poolType == Ty::Max ? 
CNNL_POOLING_MAX + : UNREACHABLEX(cnnlPoolingMode_t, ""); + // clang-format on + auto pp = info.pads; + auto ss = info.strides; + auto kk = info.kernelShape; + auto dd = info.dilations; + CNNL_ASSERT(cnnlSetPooling2dDescriptor_v2( + d->pooling, mode, CNNL_NOT_PROPAGATE_NAN, + kk[0], kk[1], pp[0], pp[2], pp[1], pp[3], + ss[0], ss[1], dd[0], dd[1], ceil)); /* NOTE(review): unqualified 'ceil' — presumably info.ceil; as written it can resolve to std::ceil from <cmath> — verify */ + + auto handle = res.fetchOrStore()->handle; + size_t extraInputSize, workspaceSize; + CNNL_ASSERT(cnnlGetPoolingWorkspaceSize(handle, mode, ys[3], ys[2], &workspaceSize)); + CNNL_ASSERT(cnnlGetPoolingExtraInputSize(handle, mode, ys[3], ys[2], &extraInputSize)); + + res.fetchOrStore(); + auto routine = [d, workspaceSize, + extraInputSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + + void *extraInputDev = workspace; + void *poolWorkSpace = reinterpret_cast<uint8_t *>(workspace) + extraInputSize; /* byte-typed arithmetic — pointer math on void* is a GNU extension, not standard C++ */ + + void *extraInputHost = malloc(extraInputSize); /* NOTE(review): leaks if a CNNL_ASSERT below fails before free() — prefer an RAII buffer (e.g. std::vector<uint8_t>) */ + CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost)); + BANG_ASSERT(cnrtMemcpy(extraInputDev, extraInputHost, extraInputSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ?
factor(0) : factor(0); + CNNL_ASSERT(cnnlPoolingForward_v2( + handle, d->pooling, + &a, d->x, inputs[0], + &b, extraInputDev, d->y, outputs[0], + poolWorkSpace, workspaceSize)); + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + + free(extraInputHost); + }; + return {std::move(routine), workspaceSize + extraInputSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.hh b/src/04kernel/src/kernels/pool/cnnl_kernel.hh new file mode 100644 index 000000000..0a0298ede --- /dev/null +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.hh @@ -0,0 +1,45 @@ +#ifndef KERNEL_POOL_CNNL_KERNEL_HH +#define KERNEL_POOL_CNNL_KERNEL_HH + +#include "kernel/attributes/pool_attributes.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + /// @brief Use `cnnlPoolingForward`. + /// It only supports 4D tensors. + struct PoolCnnl final : public Kernel { + struct + { + PoolType poolType; + DataType dt; + int xShape[4], + yShape[4], + kernelShape[2], + pads[4], + strides[2], + dilations[2]; + bool ceil; + } info; + + explicit PoolCnnl(decltype(info)) noexcept; + + static KernelBox build(PoolType, + bool, + KernelShape const &, + PoolAttributes const &, + Tensor const &, + Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_POOL_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc new file mode 100644 index 000000000..752bee690 --- /dev/null +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -0,0 +1,128 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include 
"hardware/functions.h" +#endif + +namespace refactor::kernel { + using K = ReduceCnnl; + + K::ReduceCnnl( + decltype(dataType) dataType_, + decltype(reduceType) reduceType_, + decltype(axes) axes_, + decltype(shape) shape_) noexcept + : Kernel(), + dataType(dataType_), + reduceType(reduceType_), + axes(std::move(axes_)), + shape(std::move(shape_)) {} + + auto K::build(decltype(axes) axes_, ReduceType reduceType_, TensorRefs inputs_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + auto const &x = inputs_[0].get(); + return x.dataType.isFloat() + ? std::make_unique(x.dataType, reduceType_, std::move(axes_), x.shape) + : nullptr; + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing reduce operation using CNNL"; + } + +#ifdef USE_BANG + + auto ReduceCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlReduceDescriptor_t reduce; + bool f32; + + explicit Descriptors(decltype(f32) f32_) : f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateReduceDescriptor(&reduce)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyReduceDescriptor(reduce)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(dataType != DataType::F64); + + std::vector + dimsI(shape.begin(), shape.end()), + dimsO(shape.begin(), shape.end()); + for (auto axis : axes) { + dimsO[axis] = 1; + } + setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); + setCnnlTensor(d->y, 
dataType, slice(dimsO.data(), dimsO.size())); + + // clang-format off + auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG + : reduceType == ReduceType::Sum ? CNNL_REDUCE_ADD + : reduceType == ReduceType::Min ? CNNL_REDUCE_MIN + : reduceType == ReduceType::Max ? CNNL_REDUCE_MAX + : reduceType == ReduceType::L1 ? CNNL_REDUCE_NORM1 + : reduceType == ReduceType::L2 ? CNNL_REDUCE_NORM2 + : reduceType == ReduceType::Prod ? CNNL_REDUCE_MUL + : UNREACHABLEX(cnnlReduceOp_t, ""); + // clang-format on + CNNL_ASSERT(cnnlSetReduceDescriptor_v2( + d->reduce, (int *) (axes.data()), axes.size(), reduceOp, + cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0)); + + auto handler = res.fetchOrStore()->handle; + size_t idxWorkspaceSize = axes.size() * sizeof(int); + // idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256); + size_t workspaceSize; + // get workspace + CNNL_ASSERT(cnnlGetReduceOpWorkspaceSize(handler, d->x, d->y, d->reduce, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), + idxWorkspaceSize, + workspaceSize](Resources &res, + void *workspace, + void const *const *inputs, + void *const *outputs) { + void *idxWorkspace = workspace, + *dataWorkspace = reinterpret_cast(workspace) + idxWorkspaceSize; + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? 
factor(0) : factor(0); + CNNL_ASSERT(cnnlReduce( + res.fetchOrStore()->handle, + d->reduce, + dataWorkspace, workspaceSize, + &a, d->x, inputs[0], + idxWorkspaceSize, idxWorkspace, + &b, d->y, outputs[0])); + }; + return RoutineWorkspace(std::move(routine), idxWorkspaceSize + workspaceSize); + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.hh b/src/04kernel/src/kernels/reduce/cnnl_kernel.hh new file mode 100644 index 000000000..6ffaf7387 --- /dev/null +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH +#define KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH + +#include "kernel/collectors/reduce.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ReduceCnnl final : public Kernel { + DataType dataType; + ReduceType reduceType; + Axes axes; + Shape shape; + + ReduceCnnl(decltype(dataType), + decltype(reduceType), + decltype(axes), + decltype(shape)) noexcept; + + static KernelBox build(decltype(axes), ReduceType, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; +}// namespace refactor::kernel + +#endif// KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc new file mode 100644 index 000000000..cfe2ce0fa --- /dev/null +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -0,0 +1,195 @@ +#include "binary_cnnl.hh" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = BinaryCnnl; + using Op = SimpleBinaryType; + using DT = DataType; + + K::BinaryCnnl(Op opType_, DT dataType_, std::vector aDims_, 
std::vector bDims_, std::vector cDims_) noexcept + : Kernel(), dataType(dataType_), opType(opType_), aDims(aDims_), bDims(bDims_), cDims(cDims_) {} + + auto K::build(Op op, Tensor const &a, Tensor const &b, Tensor const &c) noexcept -> KernelBox { + static const std::unordered_set + ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow}; + +#ifndef USE_BANG + return nullptr; +#endif + + if (a.dataType != b.dataType || + !a.dataType.isFloat() || + !ARTHIMETIC.contains(op) || + // At least one of a,b should have the same shape as c + (a.shape != c.shape && b.shape != c.shape) || + // Sub only supports brocasting b + (a.shape != c.shape && op == Op::Sub) || + // Cnnl binary op only supports up to 5D + !((a.rank() == 5 && b.rank() == 5) || (a.rank() <= 4 && b.rank() <= 4))) { + return nullptr; + } + + auto shape2IntVec = [](Shape shape) -> std::vector { + std::vector intVector; + intVector.reserve(shape.size()); + for (const uint32_t &element : shape) { + intVector.push_back(static_cast(element)); + } + return intVector; + }; + + return std::make_unique(op, a.dataType, shape2IntVec(a.shape), shape2IntVec(b.shape), shape2IntVec(c.shape)); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing element-wise op of 2 tensors with CNNL"; + } + +#ifdef USE_BANG + + auto BinaryCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlOpTensorDescriptor_t opDesc; + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + bool f32, sub; + + Descriptors(decltype(f32) f32_) : f32(f32_), sub(false) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&aDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&bDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&cDesc)); + 
CNNL_ASSERT(cnnlCreateOpTensorDescriptor(&opDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(aDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(bDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(cDesc)); + CNNL_ASSERT(cnnlDestroyOpTensorDescriptor(opDesc)); + } + }; + auto d = std::make_shared(dataType != DT::F64); + cnnlOpTensorDesc_t cnnlOP; + cnnlLogicOp_t cnnlLogicOP; + if (opType == SimpleBinaryType::Add) { + cnnlOP = CNNL_OP_TENSOR_ADD; + } else if (opType == SimpleBinaryType::Sub) { + cnnlOP = CNNL_OP_TENSOR_ADD; + d->sub = true; + } else if (opType == SimpleBinaryType::Mul) { + cnnlOP = CNNL_OP_TENSOR_MUL; + } else if (opType == SimpleBinaryType::And) { + cnnlLogicOP = CNNL_LOGIC_OP_AND; + } else if (opType == SimpleBinaryType::Or) { + cnnlLogicOP = CNNL_LOGIC_OP_OR; + } else if (opType == SimpleBinaryType::Xor) { + cnnlLogicOP = CNNL_LOGIC_OP_XOR; + } + + setCnnlTensor(d->aDesc, dataType, slice(aDims.data(), aDims.size())); + setCnnlTensor(d->bDesc, dataType, slice(bDims.data(), bDims.size())); + setCnnlTensor(d->cDesc, dataType, slice(cDims.data(), cDims.size())); + CNNL_ASSERT(cnnlSetOpTensorDescriptor( + d->opDesc, cnnlOP, + cnnlDataTypeConvert(d->f32 ? DT::F32 : DT::F64), + CNNL_NOT_PROPAGATE_NAN)); + + auto cnnlGetBinaryWorkspaceSize = + (opType == SimpleBinaryType::Add || opType == SimpleBinaryType::Sub || opType == SimpleBinaryType::Mul) ? cnnlGetOpTensorWorkspaceSize + : (opType == SimpleBinaryType::Div) ? cnnlGetDivWorkspaceSize + : (opType == SimpleBinaryType::And || opType == SimpleBinaryType::Or || opType == SimpleBinaryType::Xor) ? cnnlGetLogicOpWorkspaceSize + : (opType == SimpleBinaryType::Pow) ? 
cnnlGetPowWorkspaceSize + : nullptr; + + if (cnnlGetBinaryWorkspaceSize == nullptr) { + UNREACHABLE(); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + if (aDims != cDims) { + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->bDesc, + d->aDesc, d->cDesc, + &workspaceSize)); + } else { + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, + d->bDesc, d->cDesc, + &workspaceSize)); + } + + res.fetchOrStore(); + auto routine = [swap = aDims != cDims, d, + workspaceSize, cnnlLogicOP, + op = this->opType](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + // name inputs and outputs + auto a = inputs[0], + b = inputs[1]; + auto c = outputs[0]; + if (op == SimpleBinaryType::Add || op == SimpleBinaryType::Sub || op == SimpleBinaryType::Mul) { + auto alphaA = d->f32 + ? factor(1) + : factor(1), + alphaB = d->f32 + ? factor(d->sub ? -1 : 1) + : factor(d->sub ? -1 : 1), + beta = d->f32 + ? 
factor(0) + : factor(0); + + if (swap) { + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaB, d->bDesc, b, + &alphaA, d->aDesc, a, + workspace, workspaceSize, + &beta, d->cDesc, c)); + } else { + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaA, d->aDesc, a, + &alphaB, d->bDesc, b, + workspace, workspaceSize, + &beta, d->cDesc, c)); + } + } else if (op == SimpleBinaryType::Div) { + CNNL_ASSERT(cnnlDiv_v2(handle, + CNNL_COMPUTATION_HIGH_PRECISION, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } else if (op == SimpleBinaryType::And || op == SimpleBinaryType::Or || op == SimpleBinaryType::Xor) { + CNNL_ASSERT(cnnlLogicOp(handle, cnnlLogicOP, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } else if (op == SimpleBinaryType::Pow) { + CNNL_ASSERT(cnnlPow(handle, + CNNL_COMPUTATION_HIGH_PRECISION, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh b/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh new file mode 100644 index 000000000..2d5c7cfaa --- /dev/null +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_BINARY_CNNL_HH +#define KERNEL_BINARY_CNNL_HH + +#include "kernel/collectors/simple_binary.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct BinaryCnnl final : public Kernel { + DataType dataType; + SimpleBinaryType opType; + std::vector aDims, bDims, cDims; + + BinaryCnnl(SimpleBinaryType, DataType, std::vector aDims_, std::vector bDims_, std::vector cDims_) noexcept; + + static KernelBox build(SimpleBinaryType, Tensor const &, Tensor const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + 
RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_BINARY_CNNL_HH diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc new file mode 100644 index 000000000..d35535948 --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc @@ -0,0 +1,91 @@ +#include "cnnl_activation_kernel.hh" +#include "kernel/collectors/simple_unary.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = ActivationCnnl; + using DT = DataType; + using Op = SimpleUnaryType; + + K::ActivationCnnl(Op type_, DT dataType_, int size_) noexcept + : Kernel(), type(type_), dataType(dataType_), size(size_) {} + + auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { + static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh}; + +#ifndef USE_BANG + return nullptr; +#endif + + return ARTHIMETIC.contains(op) + ? 
std::make_unique(op, a.dataType, static_cast(a.elementsSize())) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing activation using CNNL"; + } + +#ifdef USE_BANG + + auto ActivationCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = SimpleUnaryType; + + // RAII for closure + struct Descriptors { + cnnlActivationDescriptor_t activation; + cnnlTensorDescriptor_t tensor; + + Descriptors() : activation(nullptr), tensor(nullptr) { + CNNL_ASSERT(cnnlCreateActivationDescriptor(&activation)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyActivationDescriptor(activation)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + // clang-format off + auto mode = type == Ty::Relu ? CNNL_ACTIVATION_RELU + : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID + : type == Ty::Tanh ? 
CNNL_ACTIVATION_TANH + : UNREACHABLEX(cnnlActivationMode_t, ""); + // clang-format on + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v2(d->activation, mode, CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, 0.0)); + + res.fetchOrStore(); + return [d = std::move(d)]// + (Resources & res, void *, void const *const *inputs, void *const *outputs) { + float alpha = 1, beta = 0; + CNNL_ASSERT(cnnlActivationForward( + res.fetchOrStore()->handle, + d->activation, + &alpha, d->tensor, inputs[0], + &beta, d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh new file mode 100644 index 000000000..a5d7ad65c --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_ACTIVATION_CNNL_KERNEL_HH +#define KERNEL_ACTIVATION_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct ActivationCnnl final : public Kernel { + SimpleUnaryType type; + DataType dataType; + int size; + + ActivationCnnl(SimpleUnaryType, DataType, int) noexcept; + + static KernelBox build(SimpleUnaryType, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_ACTIVATION_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc new file mode 100644 index 000000000..f6e32159d --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -0,0 +1,94 @@ +#include "cnnl_simple_unary_kernel.hh" +#include 
"kernel/collectors/simple_unary.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = SimpleUnaryCnnl; + using DT = DataType; + using Op = SimpleUnaryType; + + K::SimpleUnaryCnnl(Op type_, DT dataType_, int size_) noexcept + : Kernel(), type(type_), dataType(dataType_), size(size_) {} + + auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { + static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg}; + +#ifndef USE_BANG + return nullptr; +#endif + + return supportedOp.contains(op) + ? std::make_unique(op, a.dataType, static_cast(a.elementsSize())) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing simple unary using CNNL"; + } + +#ifdef USE_BANG + + auto SimpleUnaryCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = SimpleUnaryType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t tensor; + + Descriptors() : tensor(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + + auto cnnlUnaryForward = [this](cnnlHandle_t handle, + const cnnlTensorDescriptor_t x_desc, + const void *x, + const cnnlTensorDescriptor_t y_desc, + void *y) -> cnnlStatus_t { + switch (this->type) { + case Ty::Abs: + return cnnlAbs(handle, x_desc, x, y_desc, y); + case Ty::Neg: + return cnnlNegTensor(handle, x_desc, x, y_desc, y); + case Ty::Sqrt: + 
return cnnlSqrt_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); + default: + UNREACHABLE(); + } + }; + + res.fetchOrStore(); + return [d = std::move(d), cnnlUnaryForward]// + (Resources & res, void *, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlUnaryForward( + res.fetchOrStore()->handle, + d->tensor, inputs[0], + d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh new file mode 100644 index 000000000..b69902f7b --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH +#define KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct SimpleUnaryCnnl final : public Kernel { + SimpleUnaryType type; + DataType dataType; + int size; + + SimpleUnaryCnnl(SimpleUnaryType, DataType, int) noexcept; + + static KernelBox build(SimpleUnaryType, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc new file mode 100644 index 000000000..865e452e1 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = SoftmaxCnnl; + + K::SoftmaxCnnl(cnnl::SoftmaxAlgo algo_, DataType type_, + int pre_, int mid_, int post_) noexcept + 
: Kernel(), algo(algo_), dataType(type_), + pre(pre_), mid(mid_), post(post_) {} + + auto K::build(cnnl::SoftmaxAlgo algo, SoftmaxInfo info) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(algo, info.type, info.pre, info.mid, info.post); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing softmax forward with CNNL"; + } + +#ifdef USE_BANG + + auto SoftmaxCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t t; + cnnlSoftmaxAlgorithm_t algo; + bool f32; + + Descriptors(decltype(algo) algo_, decltype(f32) f32_) + : algo(algo_), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&t)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(t)); + } + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + + auto d = std::make_shared( + static_cast(algo), + dataType != DataType::F64); + int dims[]{pre, mid, post}; + cnnlSoftmaxMode_t mode = (post == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION + : (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION + : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + + // cnnlSoftmaxForward_v2 is applied to a 3D input tensor only + CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims)); + + res.fetchOrStore(); + return [d = std::move(d), mode](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? 
factor(0) : factor(0); + CNNL_ASSERT(cnnlSoftmaxForward_v2( + res.fetchOrStore()->handle, + d->algo, + mode, + CNNL_COMPUTATION_ULTRAHIGH_PRECISION, + &a, d->t, inputs[0], + &b, d->t, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.hh b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh new file mode 100644 index 000000000..b9bedb5a4 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh @@ -0,0 +1,36 @@ +#ifndef KERNEL_SOFTMAX_CNNL_HH +#define KERNEL_SOFTMAX_CNNL_HH + +#include "kernel/attributes/softmax_info.h" +#include "kernel/collectors/softmax.h" + +namespace refactor::kernel { + + namespace cnnl { + enum class SoftmaxAlgo { + FAST = 0, + ACCURATE = 1, + LOG = 2, + }; + }// namespace cnnl + + struct SoftmaxCnnl final : public Kernel { + cnnl::SoftmaxAlgo algo; + DataType dataType; + int pre, mid, post; + + SoftmaxCnnl(cnnl::SoftmaxAlgo, DataType, int, int, int) noexcept; + + static KernelBox build(cnnl::SoftmaxAlgo, SoftmaxInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SOFTMAX_CNNL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc new file mode 100644 index 000000000..26a0bef1b --- /dev/null +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -0,0 +1,92 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = TransposeCnnl; + using Info = TransposeInfo; + + K::TransposeCnnl(DataType dataType_, Shape dimIn_, Shape dimOut_, Permutation perm_) noexcept + : Kernel(), dataType(dataType_), dimIn(std::move(dimIn_)), + 
dimOut(std::move(dimOut_)), perm(std::move(perm_)) {} + + auto K::build(DataType dataType, Shape shape_, Permutation perm_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + Shape dimOut_; + for (uint32_t i = 0; i < shape_.size(); i++) { + dimOut_.push_back(shape_[perm_[i]]); + } + return std::make_unique(dataType, std::move(shape_), std::move(dimOut_), std::move(perm_)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing transpose operation using CNNL"; + } + +#ifdef USE_BANG + auto TransposeCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlTransposeDescriptor_t trans; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : x(nullptr), y(nullptr), trans(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&trans)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(trans)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + + auto d = std::make_shared(dataType != DT::F64); + setCnnlTensor(d->x, dataType, slice((int *)(dimIn.data()), dimIn.size())); + setCnnlTensor(d->y, dataType, slice((int *)(dimOut.data()), dimOut.size())); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, perm.size(), (int *)perm.data())); + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->x, d->trans, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d 
= std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + // name inputs and outputs + auto x = inputs[0]; + auto y = outputs[0]; + + CNNL_ASSERT(cnnlTranspose_v2(handle, d->trans, d->x, x, + d->y, y, workspace, workspaceSize)); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh new file mode 100644 index 000000000..37bb7e088 --- /dev/null +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_TRANSPOSE_CNNL_KERNEL_HH +#define KERNEL_TRANSPOSE_CNNL_KERNEL_HH + +#include "kernel/collectors/transpose.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + using Shape = absl::InlinedVector; + using Permutation = Shape; + + struct TransposeCnnl final : public Kernel { + DataType dataType; + Shape dimIn; + Shape dimOut; + Permutation perm; + + TransposeCnnl(DataType, Shape, Shape, Permutation) noexcept; + + static KernelBox build(DataType, Shape, Permutation) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_TRANSPOSE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/utilities/bang/cnnl_context.cc b/src/04kernel/src/utilities/bang/cnnl_context.cc new file mode 100644 index 000000000..15cc13829 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.cc @@ -0,0 +1,35 @@ +#ifdef USE_BANG + +#include "cnnl_context.hh" +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + CnnlContext::CnnlContext() : runtime::Resource() { + BANG_ASSERT(cnrtQueueCreate(&queue)); + 
CNNL_ASSERT(cnnlCreate(&handle)); + CNNL_ASSERT(cnnlSetQueue(handle, queue)); + } + CnnlContext::~CnnlContext() { + BANG_ASSERT(cnrtQueueDestroy(queue)); + CNNL_ASSERT(cnnlDestroy(handle)); + } + + auto CnnlContext::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto CnnlContext::build() -> runtime::ResourceBox { + return std::make_unique(); + } + + auto CnnlContext::resourceTypeId() const noexcept -> size_t { + return typeId(); + } + auto CnnlContext::description() const noexcept -> std::string_view { + return "CnnlContext"; + } + +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnnl_context.hh b/src/04kernel/src/utilities/bang/cnnl_context.hh new file mode 100644 index 000000000..7db40d3d3 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.hh @@ -0,0 +1,29 @@ +#ifndef KERNEL_CNNL_CONTEXT_HH +#define KERNEL_CNNL_CONTEXT_HH + +#include "runtime/resource.h" +#include +#include + +namespace refactor::kernel::cnnl { + + struct CnnlContext final : public runtime::Resource { + cnnlHandle_t handle; + cnrtQueue_t queue; + + CnnlContext(); + ~CnnlContext(); + CnnlContext(CnnlContext const &) noexcept = delete; + CnnlContext(CnnlContext &&) noexcept = delete; + + static size_t typeId() noexcept; + static runtime::ResourceBox build(); + + size_t resourceTypeId() const noexcept final; + std::string_view description() const noexcept final; + + }; + +}// namespace refactor::kernel::cnnl + +#endif// KERNEL_CNNL_CONTEXT_HH diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.cpp b/src/04kernel/src/utilities/bang/cnnl_functions.cpp new file mode 100644 index 000000000..8dfeb6457 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.cpp @@ -0,0 +1,38 @@ +#ifdef USE_BANG + +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType dataType) { + // clang-format off + switch (dataType) { + case DataType::F32 
: return CNNL_DTYPE_FLOAT; break; + case DataType::F64 : return CNNL_DTYPE_DOUBLE; break; + case DataType::FP16: return CNNL_DTYPE_HALF; break; + case DataType::I8 : return CNNL_DTYPE_INT8; break; + case DataType::I32 : return CNNL_DTYPE_INT32; break; + case DataType::U8 : return CNNL_DTYPE_UINT8; break; + case DataType::BF16: return CNNL_DTYPE_BFLOAT16; break; + case DataType::I64 : return CNNL_DTYPE_INT64; break; + case DataType::Bool: return CNNL_DTYPE_BOOL; break; + default: UNREACHABLE(); + } + // clang-format on + } + + void setCnnlTensor(cnnlTensorDescriptor_t t, DataType dt, slice_t d) { + auto dt_ = cnnlDataTypeConvert(dt); + if (auto n = d.size(); n == 4) { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } else if (n < 4) { + int d_[]{1, 1, 1, 1}; + std::copy_n(d.begin(), n, d_ + 4 - n); + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, 4, std::move(d_))); + } else { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } + } +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.h b/src/04kernel/src/utilities/bang/cnnl_functions.h new file mode 100644 index 000000000..4ba2f89d7 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.h @@ -0,0 +1,40 @@ +#ifndef KERNEL_CNNL_FUNCTIONS_H +#define KERNEL_CNNL_FUNCTIONS_H + +#include "common.h" +#include + +#define BANG_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \ + RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \ + cnrtGetErrorStr(status), (int) status)); \ + } + +#define CNNL_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNNL_STATUS_SUCCESS) { \ + fmt::println("cnnl failed on \"" #STATUS "\" with {}", \ + cnnlGetErrorString(status)); \ + abort(); \ + } + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType); + + // A helper function that set Cnnl tensor 
#ifdef USE_BANG

#include "../../../src/kernels/batch_normalization/cnnl_kernel.hh"
#include "../../../src/kernels/batch_normalization/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

// Cross-checks the CNNL batch-norm kernel against the CPU reference.
TEST(kernel, BatchNormalizationCnnl) {
    // build routines
    auto xTensor = Tensor::share(DataType::F32, Shape{1, 2, 3, 2});
    auto outTensor = Tensor::share(DataType::F32, Shape{1, 2, 3, 2});
    auto scaleTensor = Tensor::share(DataType::F32, Shape{2});
    auto biasTensor = Tensor::share(DataType::F32, Shape{2});
    auto meanTensor = Tensor::share(DataType::F32, Shape{2});
    auto varTensor = Tensor::share(DataType::F32, Shape{2});
    float epsilon = 0.00001;
    TensorRefs inputs = TensorRefs{*xTensor, *scaleTensor, *biasTensor, *meanTensor, *varTensor};
    auto kCpu = BatchNormalization::build(epsilon, inputs);
    auto kCnnl = BatchNormalizationCnnl::build(epsilon, inputs);
    ASSERT_TRUE(kCpu && kCnnl);
    auto res = runtime::Resources();
    auto rCpu = kCpu->lower(res).routine;
    auto [rMlu, workspaceSize] = kCnnl->lower(res);
    // device buffers
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         mluIn = dev.malloc(xTensor->bytesSize()),
         mluScale = dev.malloc(scaleTensor->bytesSize()),
         mluBias = dev.malloc(biasTensor->bytesSize()),
         mluMean = dev.malloc(meanTensor->bytesSize()),
         mluVar = dev.malloc(varTensor->bytesSize()),
         mluOut = dev.malloc(outTensor->bytesSize());
    // host-side inputs
    std::vector<float>
        data(xTensor->elementsSize(), 1.0f),
        scale(scaleTensor->elementsSize(), 0.5f),
        bias(biasTensor->elementsSize(), 1.0f),
        mean(meanTensor->elementsSize(), 0.5f),
        var(varTensor->elementsSize(), 1.0f),
        cpuOut(outTensor->elementsSize());
    mluIn->copyFromHost(data.data(), xTensor->bytesSize());
    mluScale->copyFromHost(scale.data(), scaleTensor->bytesSize());
    mluBias->copyFromHost(bias.data(), biasTensor->bytesSize());
    mluMean->copyFromHost(mean.data(), meanTensor->bytesSize());
    mluVar->copyFromHost(var.data(), varTensor->bytesSize());
    // run the CPU reference, then the MLU kernel
    {
        void const *inputs[]{data.data(), scale.data(), bias.data(), mean.data(), var.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{*mluIn, *mluScale, *mluBias, *mluMean, *mluVar};
        void *outputs[]{*mluOut};
        rMlu(res, *workspace, inputs, outputs);
    }
    // compare element-wise
    std::vector<float> result(outTensor->elementsSize());
    mluOut->copyToHost(result.data(), outTensor->bytesSize());
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}

#endif
Tensor::share(DataType::F32, xShape); + auto yTensor = Tensor::share(DataType::F32, yShape); + //bool ceil = false; + bool ceil = true; + int64_t const dilations[] = {1, 1}; + PoolAttributes poolAttributes(rank, dilations, pads, strides); + + auto kernel = PoolCnnl::build(poolType, ceil, kernelShape, poolAttributes, *dataTensor, *yTensor); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i * 0.1f; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, *workspace, inputs, outputs); + // take output data + std::vector result(yTensor->elementsSize()); + mluMem->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(ExpectData.size())) { + EXPECT_FLOAT_EQ(ExpectData[i], result[i]); + } +} + +TEST(kernel, PoolCnnlMax) { + int rank = 2; + int64_t const + pads[]{0, 0, 0, 0}, + strides[]{2, 2}; + KernelShape kernelShape{2, 2}; + Shape + xShape{1, 1, 4, 4}, + yShape{1, 1, 2, 2}; + const std::vector ExpectData = {0.5, 0.7, 1.3, 1.5}; + testPoolCnnl(PoolType::Max, rank, pads, strides, kernelShape, xShape, yShape, ExpectData); +} + +TEST(kernel, PoolCnnlAvg) { + int rank = 2; + int64_t const + pads[]{0, 0, 0, 0}, + strides[]{2, 2}; + KernelShape kernelShape{2, 2}; + Shape + xShape{1, 1, 4, 4}, + yShape{1, 1, 2, 2}; + const std::vector ExpectData = {0.25, 0.45, 1.05, 1.25}; + testPoolCnnl(PoolType::Average, rank, pads, strides, kernelShape, xShape, yShape, ExpectData); +} + +#endif diff --git a/src/04kernel/test/kernels/reduce/test_cnnl.cpp b/src/04kernel/test/kernels/reduce/test_cnnl.cpp new file mode 100644 
index 000000000..32952fead --- /dev/null +++ b/src/04kernel/test/kernels/reduce/test_cnnl.cpp @@ -0,0 +1,64 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/reduce/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testReducemean(const Shape &shape, const std::vector &data, + Axes axes, const std::vector ExpectData) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, shape); + auto kernel = ReduceCnnl::build(axes, ReduceType::Mean, {*dataTensor}); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + mluMemIn = dev.malloc(dataTensor->bytesSize()), + mluMemOut = dev.malloc(dataTensor->bytesSize()); + // put input output data + mluMemIn->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMemIn}; + void *outputs[]{*mluMemOut}; + routine(res, *workspace, inputs, outputs); + } + // take output data + Shape outDimArray; + std::unordered_set axesSet(axes.begin(), axes.end()); + for (size_t i = 0; i < shape.size(); ++i) { + if (axesSet.contains(i)) { + outDimArray.push_back(shape[i]); + } + } + auto outputTensor = Tensor::share(DataType::F32, outDimArray); + std::vector result(outDimArray.size()); + mluMemOut->copyToHost(result.data(), outputTensor->bytesSize()); + // check + for (auto i : range0_(ExpectData.size())) { + EXPECT_FLOAT_EQ(ExpectData[i], result[i]); + } +} + +TEST(kernel, ReduceMeanCnnl) { + testReducemean({2, 3, 2, 2}, + {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 2}, + {5, 6, 17, 18}); + testReducemean({2, 3, 2, 2, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 2}, + {5, 6, 17, 18}); +} + +#endif 
#ifdef USE_BANG

// FIX: use the same relative include depth as the sibling kernel tests —
// this file lives in test/kernels/simple_binary/, three levels below src's
// sibling, so "../src/..." did not resolve.
#include "../../../src/kernels/simple_binary/binary_cnnl.hh"
#include "../../../src/kernels/simple_binary/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

// Cross-checks one binary op on the MLU against the CPU reference kernel.
void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape dimC) {
    // Create Tensor and build kernels
    auto aTensor = Tensor::share(DataType::F32, dimA, LayoutType::NCHW);
    auto bTensor = Tensor::share(DataType::F32, dimB, LayoutType::NCHW);
    auto cTensor = Tensor::share(DataType::F32, dimC, LayoutType::NCHW);
    auto kernel = BinaryCnnl::build(binaryOPT, *aTensor, *bTensor, *cTensor);
    auto kCpu = BinaryCpu::build(binaryOPT, *aTensor, *bTensor);
    ASSERT_TRUE(kCpu && kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    auto rCpu = kCpu->lower(res).routine;
    // Init inputs and outputs
    std::vector<float>
        a(aTensor->elementsSize(), 3.0f),
        b(bTensor->elementsSize(), 2.0f),
        c(cTensor->elementsSize());
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         aMLU = dev.malloc(aTensor->bytesSize()),
         bMLU = dev.malloc(bTensor->bytesSize()),
         cMLU = dev.malloc(cTensor->bytesSize());
    aMLU->copyFromHost(a.data(), aTensor->bytesSize());
    bMLU->copyFromHost(b.data(), bTensor->bytesSize());
    // Compute
    {
        void const *inputs[]{*aMLU, *bMLU};
        void *outputs[]{*cMLU};
        routine(res, *workspace, inputs, outputs);
    }
    {
        void const *inputs[]{a.data(), b.data()};
        void *outputs[]{c.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // Compare
    std::vector<float> result(cTensor->elementsSize());
    cMLU->copyToHost(result.data(), cTensor->bytesSize());
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(c[i], result[i]);
    }
}

TEST(kernel, BinaryCnnlAdd) {
    testBinaryCnnl(SimpleBinaryType::Add, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlMul) {
    testBinaryCnnl(SimpleBinaryType::Mul, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlSub) {
    testBinaryCnnl(SimpleBinaryType::Sub, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlDiv) {
    testBinaryCnnl(SimpleBinaryType::Div, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

// Logic ops need a boolean/int dtype on the CPU reference; left disabled.
// TEST(kernel, BinaryCnnlAnd) {
//     testBinaryCnnl(SimpleBinaryType::And, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

// TEST(kernel, BinaryCnnlOr) {
//     testBinaryCnnl(SimpleBinaryType::Or, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

// TEST(kernel, BinaryCnnlXor) {
//     testBinaryCnnl(SimpleBinaryType::Xor, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

TEST(kernel, BinaryCnnlPow) {
    testBinaryCnnl(SimpleBinaryType::Pow, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlBroadcast) {
    testBinaryCnnl(SimpleBinaryType::Add, Shape{3, 4, 5, 6}, Shape{}, Shape{3, 4, 5, 6});
}

#endif
+#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testOp(SimpleUnaryType opType, bool activation = true) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{20, 30, 50}); + auto kernel = activation ? ActivationCnnl::build(opType, *dataTensor) + : SimpleUnaryCnnl::build(opType, *dataTensor); + auto kCpu = SimpleUnaryCpu::build(opType, *dataTensor); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i * 1e-4f; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{data.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluMem->copyToHost(result.data(), dataTensor->bytesSize()); + // check + for (auto i : range0_(data.size())) { + EXPECT_NEAR(data[i], result[i], 1e-4); + } +} + +TEST(kernel, SimpleUnaryCnnl) { + testOp(SimpleUnaryType::Abs, false); + testOp(SimpleUnaryType::Neg, false); + testOp(SimpleUnaryType::Sqrt, false); +} + +TEST(kernel, ActivationCnnl) { + testOp(SimpleUnaryType::Relu); + testOp(SimpleUnaryType::Sigmoid); + testOp(SimpleUnaryType::Tanh); +} + + +#endif// USE_BANG diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp new file mode 100644 index 000000000..a8c7fb283 --- /dev/null +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -0,0 +1,52 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/softmax/cpu_kernel.hh" 
+#include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SoftmaxCnnl) { + // build routine + auto xTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + auto outTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + dim_t axis = 2; + auto kCpu = SoftmaxCpu::build(SoftmaxInfo(*xTensor, axis)); + auto kCnnl = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::FAST, SoftmaxInfo(*xTensor, axis)); + ASSERT_TRUE(kCpu && kCnnl); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto rCnnl = kCnnl->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(xTensor->bytesSize()), + mluOut = dev.malloc(outTensor->bytesSize()); + // put input data + std::vector + data(xTensor->elementsSize(), 0), + cpuOut(outTensor->elementsSize()); + mluIn->copyFromHost(data.data(), xTensor->bytesSize()); + // inference + { + void const *inputs[]{data.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + rCnnl(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(outTensor->elementsSize()); + mluOut->copyToHost(result.data(), outTensor->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp new file mode 100644 index 000000000..4f4301d86 --- /dev/null +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -0,0 +1,55 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/transpose/cnnl_kernel.hh" +#include "../../../src/kernels/transpose/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; 
+using namespace hardware; + +TEST(kernel, TransposeCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{1, 3, 2, 5}); + auto info = TransposeInfo(dataTensor->shape, Permutation{2, 3, 0, 1}); + auto kCpu = TransposeCpu::build(dataTensor->dataType, info); + auto kernel = TransposeCnnl::build(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1}); + ASSERT_TRUE(kCpu && kernel); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto bytes = dataTensor->bytesSize(); + auto workspace = dev.malloc(workspaceSize), + mluIn = dev.malloc(bytes), + mluOut = dev.malloc(bytes); + // put input data + std::vector + cpuIn(dataTensor->elementsSize()), + cpuOut(cpuIn.size()); + std::iota(cpuIn.begin(), cpuIn.end(), 0); + mluIn->copyFromHost(cpuIn.data(), bytes); + // inference + { + void const *inputs[]{cpuIn.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluOut->copyToHost(result.data(), bytes); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif From e329552d84db11a4626fe8ab08777f47e8d0f5bc Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Thu, 11 Jan 2024 03:41:10 +0000 Subject: [PATCH 2/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=92?= =?UTF-8?q?=E6=AD=A6=E7=BA=AA=E5=B9=B3=E5=8F=B0where/expand/conv=E7=AE=97?= =?UTF-8?q?=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/conv.cc | 6 + src/04kernel/src/collectors/where.cc | 8 +- .../batch_normalization/cnnl_kernel.cc | 6 +- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 243 
++++++++++++++++++ src/04kernel/src/kernels/conv/cnnl_kernel.hh | 43 ++++ .../src/kernels/expand/cnnl_kernel.cc | 68 +++++ .../src/kernels/expand/cnnl_kernel.hh | 31 +++ .../src/kernels/transpose/cnnl_kernel.cc | 36 ++- .../src/kernels/transpose/cnnl_kernel.hh | 15 +- src/04kernel/src/kernels/where/cnnl_kernel.cc | 116 +++++++++ src/04kernel/src/kernels/where/cnnl_kernel.hh | 32 +++ src/04kernel/test/kernels/conv/test_cnnl.cpp | 76 ++++++ .../test/kernels/expand/test_cnnl.cpp | 50 ++++ src/04kernel/test/kernels/where/test_cnnl.cpp | 68 +++++ 14 files changed, 775 insertions(+), 23 deletions(-) create mode 100644 src/04kernel/src/kernels/conv/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/conv/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/expand/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/expand/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/where/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/where/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/conv/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/expand/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/where/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/conv.cc b/src/04kernel/src/collectors/conv.cc index 14b61835f..d9cc0ea27 100644 --- a/src/04kernel/src/collectors/conv.cc +++ b/src/04kernel/src/collectors/conv.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/conv.h" +#include "../kernels/conv/cnnl_kernel.hh" #include "../kernels/conv/cudnn_kernel.hh" namespace refactor::kernel { @@ -23,6 +24,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ConvCnnl::build(poolAttrs, x, w, b, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/where.cc b/src/04kernel/src/collectors/where.cc index 1ce2b60c6..14aa21952 100644 --- 
a/src/04kernel/src/collectors/where.cc +++ b/src/04kernel/src/collectors/where.cc @@ -1,11 +1,12 @@ #include "kernel/collectors/where.h" +#include "../kernels/where/cnnl_kernel.hh" #include "../kernels/where/cpu_kernel.hh" #include "../kernels/where/where_cuda.hh" namespace refactor::kernel { std::vector - WhereCollector::filter(TensorRefs inputs, TensorRefs) const { + WhereCollector::filter(TensorRefs inputs, TensorRefs outputs) const { std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -18,6 +19,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = WhereCnnl::build(inputs, outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc index be06233cd..e3efcd1d3 100644 --- a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -107,10 +107,10 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); auto handle = res.fetchOrStore()->handle; - auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX); + auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * info.dtX.size(); size_t workspaceSize; CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize)); - size_t totalWorkspaceSize = xTransSize + workspaceSize; + size_t totalWorkspaceSize = xTransSize * 2 + workspaceSize; res.fetchOrStore(); auto routine = [d = std::move(d), @@ -129,7 +129,7 @@ namespace refactor::kernel { void *xTrans = workspace; void *yTrans = xTrans + xTransSize; - void *cursor = yTrans + workspaceSize; + void *cursor = yTrans + xTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, diff --git 
a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc new file mode 100644 index 000000000..85dcb9517 --- /dev/null +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ -0,0 +1,243 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include "../expand/cnnl_kernel.hh" +#include "hardware/functions.h" +#endif + +namespace refactor::kernel { + using K = ConvCnnl; + + K::ConvCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(PoolAttributes const &poolAttributes, + Tensor const &x, + Tensor const &w, + std::optional> b, + Tensor const &y) -> KernelBox { + static const std::unordered_set + SET{DataType::FP16, DataType::BF16, DataType::F32, DataType::F64, DataType::I8}; +#ifndef USE_BANG + return nullptr; +#endif + + auto dt = x.dataType; + if (!SET.contains(dt) || w.dataType != dt || y.dataType != dt) { + return nullptr; + } + + std::optional biasExpand = std::nullopt; + if (b) { + ASSERT(b->get().shape[0] == y.shape[1], ""); + std::vector input(y.rank(), 1); + input[1] = y.shape[1]; + biasExpand.emplace(ExpandInfoCnnl( + b->get().dataType, + slice(input.data(), input.size()), + slice(y.shape.data(), y.rank()))); + } + + // group is not supported + if (w.rank() != 4 || poolAttributes.rank() != 2) { + return nullptr; + } + auto d = poolAttributes.dilations(), + p = poolAttributes.pads(), + s = poolAttributes.strides(); + return std::make_unique(decltype(info){ + dt, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }, + { + static_cast(w.shape[0]), + static_cast(w.shape[1]), + static_cast(w.shape[2]), + static_cast(w.shape[3]), + }, + { + static_cast(y.shape[0]), + static_cast(y.shape[1]), + static_cast(y.shape[2]), + static_cast(y.shape[3]), + }, + {d[0], d[1]}, + {p[0], p[1], p[2], p[3]}, + {s[0], s[1]}, + std::move(biasExpand), 
+ }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing conv using CNNL"; + } + +#ifdef USE_BANG + + auto ConvCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y, w; + cnnlTensorDescriptor_t xTrans, yTrans, wTrans; + cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; + cnnlConvolutionDescriptor_t conv; + cnnlConvolutionForwardAlgo_t algo; + // std::optional extraPadding; + std::optional biasExpand; + bool f32; + + Descriptors(decltype(f32) f32_) + :// extraPadding(std::nullopt), + biasExpand(std::nullopt), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&w)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&xTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&yTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&wTrans)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW)); + CNNL_ASSERT(cnnlCreateConvolutionDescriptor(&conv)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(w)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(xTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(yTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(wTrans)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW)); + CNNL_ASSERT(cnnlDestroyConvolutionDescriptor(conv)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dt != DataType::F64); + // 
d->extraPadding = ExtraPadding::build(info.dt, info.xShape, info.pad); + if (info.biasExpand) { + d->biasExpand = ExpandCnnl(*info.biasExpand).lower(res).routine; + } + int xs[]{ + info.xShape[0], + info.xShape[1], + info.xShape[2] + std::abs(info.pad[0] - info.pad[2]), + info.xShape[3] + std::abs(info.pad[1] - info.pad[3]), + }; + + auto NHWC = [](const int shape[]) -> std::vector { + return { + shape[0], shape[2], shape[3], shape[1]}; + }; + + std::vector xsNHWC = NHWC(xs); + std::vector wsNHWC = NHWC(info.wShape); + std::vector ysNHWC = NHWC(info.yShape); + + setCnnlTensor(d->x, info.dt, slice(xs, 4)); + setCnnlTensor(d->y, info.dt, slice(info.yShape, 4)); + setCnnlTensor(d->w, info.dt, slice(info.wShape, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->xTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, xsNHWC.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->yTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, ysNHWC.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->wTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, wsNHWC.data())); + + auto xTransSize = cnnlGetTensorElementNum(d->xTrans) * info.dt.size(); + auto yTransSize = cnnlGetTensorElementNum(d->yTrans) * info.dt.size(); + auto wTransSize = cnnlGetTensorElementNum(d->wTrans) * info.dt.size(); + + int permuteIn[4] = {0, 2, 3, 1}; + int permuteOut[4] = {0, 3, 1, 2}; + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permuteIn)); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); + + size_t xWorkspaceSize, yWorkspaceSize, wWorkspaceSize, convWorkspaceSize; + auto handle = res.fetchOrStore()->handle; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->x, d->NCHW2NHWC, &xWorkspaceSize)); + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->w, d->NCHW2NHWC, &wWorkspaceSize)); + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->yTrans, d->NHWC2NCHW, &yWorkspaceSize)); + + // clang-format off + auto computation = info.dt == DataType::F64 ? 
DataType::F64 + : info.dt == DataType::I8 ? DataType::I32 + : DataType::F32; + // clang-format on + auto group = xs[1] / info.wShape[1]; + CNNL_ASSERT(cnnlSetConvolutionDescriptor(d->conv, 4, info.pad, info.stride, info.dilation, group, cnnlDataTypeConvert(computation))); + CNNL_ASSERT(cnnlGetConvolutionForwardAlgorithm( + handle, d->conv, d->xTrans, d->wTrans, d->yTrans, + CNNL_CONVOLUTION_FWD_FASTEST, &d->algo)); + + CNNL_ASSERT(cnnlGetConvolutionForwardWorkspaceSize( + handle, d->xTrans, d->wTrans, d->yTrans, NULL, + d->conv, d->algo, &convWorkspaceSize)); + + // if (d->extraPadding) { + // workspaceSize = hardware::alignBytes(workspaceSize, 256); + // } + + size_t workspaceSize = xTransSize + yTransSize + wTransSize + std::max({xWorkspaceSize, wWorkspaceSize, yWorkspaceSize, convWorkspaceSize}); + + res.fetchOrStore(); + auto routine = [d, xTransSize, yTransSize, wTransSize, + xWorkspaceSize, wWorkspaceSize, + yWorkspaceSize, convWorkspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + void const *x = inputs[0], *w = inputs[1]; + void *y = outputs[0]; + // if (auto f = d->extraPadding; f) { + // x = (*f)(x, reinterpret_cast(workspace) + workspaceSize); + // } + // if (auto f = d->biasExpand; f) { + // (*f)(res, workspace, inputs + 2, outputs); + // } + + void *xTrans = workspace; + void *wTrans = xTrans + xTransSize; + void *yTrans = wTrans + wTransSize; + void *opWorkspace = yTrans + yTransSize; + + // transpose NCHW input to NHWC + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x, + d->xTrans, xTrans, opWorkspace, xWorkspaceSize)); + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->w, w, + d->wTrans, wTrans, opWorkspace, wWorkspaceSize)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 + ? factor(d->biasExpand ? 1 : 0) + : factor(d->biasExpand ? 
1 : 0); + CNNL_ASSERT(cnnlConvolutionForward( + handle, + d->conv, d->algo, &a, + d->xTrans, xTrans, d->wTrans, wTrans, + NULL, NULL, opWorkspace, convWorkspaceSize, + &b, d->yTrans, yTrans)); + + // transpose NHWC intermediates to NCHW + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->yTrans, yTrans, + d->y, y, opWorkspace, yWorkspaceSize)); + }; + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.hh b/src/04kernel/src/kernels/conv/cnnl_kernel.hh new file mode 100644 index 000000000..814e75ad5 --- /dev/null +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.hh @@ -0,0 +1,43 @@ +#ifndef KERNEL_CONV_CNNL_KERNEL_HH +#define KERNEL_CONV_CNNL_KERNEL_HH + +#include "../../kernels/expand/cnnl_kernel.hh" +#include "kernel/attributes/pool_attributes.h" +#include "kernel/kernel.h" +#include + +namespace refactor::kernel { + + /// @brief Use `cnnlConvolutionForward`. + /// It only supports 4D tensors. 
+ struct ConvCnnl final : public Kernel { + struct { + DataType dt; + int xShape[4], + wShape[4], + yShape[4], + dilation[2], + pad[4], + stride[2]; + std::optional biasExpand; + } info; + + explicit ConvCnnl(decltype(info)) noexcept; + + static KernelBox build(PoolAttributes const &, + Tensor const &, + Tensor const &, + std::optional>, + Tensor const &); + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CONV_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.cc b/src/04kernel/src/kernels/expand/cnnl_kernel.cc new file mode 100644 index 000000000..18f586422 --- /dev/null +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.cc @@ -0,0 +1,68 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ExpandCnnl; + + K::ExpandCnnl(ExpandInfoCnnl info_) noexcept + : Kernel(), info(info_) {} + + auto K::build(Tensor const &input, Tensor const &output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(ExpandInfoCnnl( + input.dataType, + slice(input.shape.data(), input.rank()), + slice(output.shape.data(), output.rank()) + )); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing expand operation using CNNL"; + } + +#ifdef USE_BANG + auto ExpandCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + + Descriptors() : 
inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + std::vector in(info.inDims.begin(), info.inDims.end()), + out(info.outDims.begin(), info.outDims.end()); + setCnnlTensor(d->inDesc, info.dataType, slice(in.data(), in.size())); + setCnnlTensor(d->outDesc, info.dataType, slice(out.data(), out.size())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlExpand(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->outDesc, outputs[0])); + // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.hh b/src/04kernel/src/kernels/expand/cnnl_kernel.hh new file mode 100644 index 000000000..a6271907e --- /dev/null +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.hh @@ -0,0 +1,31 @@ +#ifndef KERNEL_EXPAND_CNNL_KERNEL_HH +#define KERNEL_EXPAND_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ExpandInfoCnnl { + DataType dataType; + slice_t inDims, outDims; + }; + + struct ExpandCnnl final : public Kernel { + ExpandInfoCnnl info; + + explicit ExpandCnnl(ExpandInfoCnnl) noexcept; + + static KernelBox build(Tensor const &input, Tensor const &output) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_EXPAND_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc 
b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc index 26a0bef1b..a5a6881e4 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -8,21 +8,29 @@ namespace refactor::kernel { using K = TransposeCnnl; - using Info = TransposeInfo; + using Info = TransposeInfoCnnl; - K::TransposeCnnl(DataType dataType_, Shape dimIn_, Shape dimOut_, Permutation perm_) noexcept - : Kernel(), dataType(dataType_), dimIn(std::move(dimIn_)), - dimOut(std::move(dimOut_)), perm(std::move(perm_)) {} + Info::TransposeInfoCnnl(DataType dataType_, std::vector input_, std::vector perm_) + : dataType(dataType_), inDim(input_), perm(perm_) { + ASSERT(input_.size() == perm_.size(), "Unreachable"); + for (uint32_t i = 0; i < input_.size(); i++) { + outDim.push_back(input_[perm_[i]]); + } + } + + Info::TransposeInfoCnnl(DataType dataType, Shape shape, Permutation perm) + : TransposeInfoCnnl(dataType, + std::move(std::vector(shape.begin(), shape.end())), + std::move(std::vector(perm.begin(), perm.end()))) { } - auto K::build(DataType dataType, Shape shape_, Permutation perm_) noexcept -> KernelBox { + K::TransposeCnnl(Info info_) noexcept + : Kernel(), info(std::move(info_)) { } + + auto K::build(DataType dataType, Shape shape, Permutation perm) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif - Shape dimOut_; - for (uint32_t i = 0; i < shape_.size(); i++) { - dimOut_.push_back(shape_[perm_[i]]); - } - return std::make_unique(dataType, std::move(shape_), std::move(dimOut_), std::move(perm_)); + return std::make_unique(TransposeInfoCnnl(dataType, shape, perm)); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1; @@ -63,10 +71,10 @@ namespace refactor::kernel { Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(dataType != DT::F64); - setCnnlTensor(d->x, dataType, slice((int *)(dimIn.data()), dimIn.size())); - setCnnlTensor(d->y, dataType, slice((int *)(dimOut.data()), dimOut.size())); - 
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, perm.size(), (int *)perm.data())); + auto d = std::make_shared(info.dataType != DT::F64); + setCnnlTensor(d->x, info.dataType, slice(info.inDim.data(), info.inDim.size())); + setCnnlTensor(d->y, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, info.perm.size(), info.perm.data())); auto handle = res.fetchOrStore()->handle; size_t workspaceSize; diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh index 37bb7e088..62254dc4e 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh @@ -9,13 +9,18 @@ namespace refactor::kernel { using Shape = absl::InlinedVector; using Permutation = Shape; - struct TransposeCnnl final : public Kernel { + struct TransposeInfoCnnl { DataType dataType; - Shape dimIn; - Shape dimOut; - Permutation perm; + std::vector inDim, outDim, perm; + + TransposeInfoCnnl(DataType, std::vector, std::vector); + TransposeInfoCnnl(DataType, Shape, Permutation); + }; + + struct TransposeCnnl final : public Kernel { + TransposeInfoCnnl info; - TransposeCnnl(DataType, Shape, Shape, Permutation) noexcept; + TransposeCnnl(TransposeInfoCnnl) noexcept; static KernelBox build(DataType, Shape, Permutation) noexcept; static size_t typeId() noexcept; diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc new file mode 100644 index 000000000..774c5513d --- /dev/null +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -0,0 +1,116 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = WhereCnnl; + + K::WhereCnnl(decltype(info) info_) noexcept + : Kernel(), info(info_) {} + + auto K::build(TensorRefs const &inputs, TensorRefs const 
&outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info) { + inputs[1].get().dataType, + inputs[0].get().shape, + inputs[1].get().shape, + inputs[2].get().shape, + outputs[0].get().shape, + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing where operation using CNNL"; + } + +#ifdef USE_BANG + auto WhereCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t cond, x, y, ans; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : cond(nullptr), x(nullptr), y(nullptr), + ans(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&ans)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(cond)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(ans)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dataType != DT::F64); + + std::vector cDim(info.condDim.begin(), info.condDim.end()), + xDim(info.thenDim.begin(), info.thenDim.end()), + yDim(info.elseDim.begin(), info.elseDim.end()), + ansDim(info.outputDim.begin(), info.outputDim.end()); + + auto rightAlign = [](std::vector &dim, uint32_t targetLength) { + if (dim.size() < targetLength) { + dim.insert(dim.begin(), targetLength - dim.size(), 1); + } + }; + if (ansDim.size() == 0) { + ansDim.push_back(1); + } + rightAlign(cDim, ansDim.size()); + rightAlign(xDim, 
ansDim.size()); + rightAlign(yDim, ansDim.size()); + + CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data())); + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSelectV2WorkspaceSize(handle, d->cond, d->x, d->y, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + auto cond = inputs[0], + x = inputs[1], + y = inputs[2]; + auto ans = outputs[0]; + + CNNL_ASSERT(cnnlSelectV2( + handle, d->cond, cond, d->x, x, + d->y, y, workspace, workspaceSize, + d->ans, ans)); + + cnrtQueueSync(res.fetchOrStore()->queue); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.hh b/src/04kernel/src/kernels/where/cnnl_kernel.hh new file mode 100644 index 000000000..6139b1832 --- /dev/null +++ b/src/04kernel/src/kernels/where/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_WHERE_CNNL_HH +#define KERNEL_WHERE_CNNL_HH + +#include "kernel/collectors/where.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + using Shape = absl::InlinedVector; + + struct WhereCnnl final : public Kernel { + struct { + DataType dataType; + Shape condDim, thenDim, elseDim, outputDim; + } info; + + WhereCnnl(decltype(info)) noexcept; + + static KernelBox build(TensorRefs const &, 
TensorRefs const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_WHERE_CNNL_HH diff --git a/src/04kernel/test/kernels/conv/test_cnnl.cpp b/src/04kernel/test/kernels/conv/test_cnnl.cpp new file mode 100644 index 000000000..bf4bff3c8 --- /dev/null +++ b/src/04kernel/test/kernels/conv/test_cnnl.cpp @@ -0,0 +1,76 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/conv/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const int64_t *dilations, + Shape xShape, Shape wShape, Shape yShape, + const std::vector &ExpectData) { + auto xTensor = Tensor::share(DataType::F32, xShape); + auto wTensor = Tensor::share(DataType::F32, wShape); + auto yTensor = Tensor::share(DataType::F32, yShape); + PoolAttributes poolAttributes(rank, dilations, pads, strides); + auto kernel = ConvCnnl::build(poolAttributes, *xTensor, *wTensor, std::nullopt, *yTensor); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + xMlu = dev.malloc(xTensor->bytesSize()), + wMlu = dev.malloc(wTensor->bytesSize()), + yMlu = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector xIncremental(xTensor->elementsSize()), + wIncremental(wTensor->elementsSize()); + std::iota(xIncremental.begin(), xIncremental.end(), 0); + std::iota(wIncremental.begin(), wIncremental.end(), 0); + std::vector xData(xIncremental.begin(), xIncremental.end()), + wData(wIncremental.begin(), wIncremental.end()); + 
#ifdef USE_BANG

#include "../../../src/kernels/conv/cnnl_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

/// Runs ConvCnnl on incremental input/filter data and checks against ExpectData.
void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const int64_t *dilations,
                  Shape xShape, Shape wShape, Shape yShape,
                  const std::vector<float> &ExpectData) {
    auto xTensor = Tensor::share(DataType::F32, xShape);
    auto wTensor = Tensor::share(DataType::F32, wShape);
    auto yTensor = Tensor::share(DataType::F32, yShape);
    PoolAttributes poolAttributes(rank, dilations, pads, strides);
    auto kernel = ConvCnnl::build(poolAttributes, *xTensor, *wTensor, std::nullopt, *yTensor);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    // bang malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         xMlu = dev.malloc(xTensor->bytesSize()),
         wMlu = dev.malloc(wTensor->bytesSize()),
         yMlu = dev.malloc(yTensor->bytesSize());
    // put input data: 0, 1, 2, ... in both x and w
    std::vector<int> xIncremental(xTensor->elementsSize()),
        wIncremental(wTensor->elementsSize());
    std::iota(xIncremental.begin(), xIncremental.end(), 0);
    std::iota(wIncremental.begin(), wIncremental.end(), 0);
    std::vector<float> xData(xIncremental.begin(), xIncremental.end()),
        wData(wIncremental.begin(), wIncremental.end());
    xMlu->copyFromHost(xData.data(), xTensor->bytesSize());
    wMlu->copyFromHost(wData.data(), wTensor->bytesSize());
    // inference
    void const *inputs[]{*xMlu, *wMlu};
    void *outputs[]{*yMlu};
    routine(res, *workspace, inputs, outputs);
    // take output data (dead read-back of the inputs and commented-out
    // debugging dumps removed)
    std::vector<float> result(yTensor->elementsSize());
    yMlu->copyToHost(result.data(), yTensor->bytesSize());
    // check
    for (auto i : range0_(ExpectData.size())) {
        EXPECT_FLOAT_EQ(ExpectData[i], result[i]);
    }
}

TEST(kernel, ConvCnnl) {
    int rank = 2;
    int64_t const
        pads[]{1, 1, 1, 1},
        strides[]{1, 1},
        dilations[]{1, 1};
    Shape
        xShape{1, 3, 3, 2},
        wShape{1, 3, 3, 2},
        yShape{1, 1, 3, 3};
    const std::vector<float> ExpectData = {570, 1158, 582, 888, 1785, 888, 582, 1158, 570};
    testConvCnnl(rank, pads, strides, dilations, xShape, wShape, yShape, ExpectData);
}


#endif
#ifdef USE_BANG

#include "../../../src/kernels/expand/cnnl_kernel.hh"
#include "../../../src/kernels/expand/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, ExpandCnnl) {
    // build routine: broadcast (3,4,1,6) to (2,3,4,5,6), checked against the
    // CPU reference kernel
    auto input = Tensor::share(DataType::F32, Shape{3, 4, 1, 6}),
         output = Tensor::share(DataType::F32, Shape{2, 3, 4, 5, 6});
    auto kernel = ExpandCnnl::build(*input, *output);
    auto kCpu = ExpandCpu::build(ExpandInfo(*input, *output));
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;
    auto rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto mluIn = dev.malloc(input->bytesSize()),
         mluOut = dev.malloc(output->bytesSize());
    // put input data
    std::vector<float>
        data(input->elementsSize()),
        ans(output->elementsSize()),
        result(ans.size());
    std::iota(data.begin(), data.end(), 0);
    mluIn->copyFromHost(data.data(), input->bytesSize());
    // inference
    {
        void const *inputs[]{*mluIn};
        void *outputs[]{*mluOut};
        routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{data.data()};
        void *outputs[]{ans.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // check
    mluOut->copyToHost(result.data(), output->bytesSize());
    EXPECT_EQ(result, ans);
}

#endif
#ifdef USE_BANG

#include "../../../src/kernels/where/cnnl_kernel.hh"
#include "../../../src/kernels/where/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

/// Compares WhereCnnl against the CPU reference on broadcastable shapes.
void testWhereCnnl(Shape cDim, Shape xDim, Shape yDim, Shape outDim) {
    // build routine
    auto cTensor = Tensor::share(DataType::Bool, cDim);
    auto xTensor = Tensor::share(DataType::F32, xDim);
    auto yTensor = Tensor::share(DataType::F32, yDim);
    auto outTensor = Tensor::share(DataType::F32, outDim);
    auto kCpu = WhereCpu::build({*cTensor, *xTensor, *yTensor});
    auto kCnnl = WhereCnnl::build({*cTensor, *xTensor, *yTensor}, {*outTensor});
    ASSERT_TRUE(kCpu && kCnnl);
    auto res = runtime::Resources();
    auto rCpu = kCpu->lower(res).routine;
    auto [rCnnl, workspaceSize] = kCnnl->lower(res);
    // malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         mluC = dev.malloc(cTensor->bytesSize()),
         mluX = dev.malloc(xTensor->bytesSize()),
         mluY = dev.malloc(yTensor->bytesSize()),
         mluOut = dev.malloc(outTensor->bytesSize());
    // put input data: one byte per Bool element, all true.
    // (The original used a variable-length `int` array filled by memset,
    // which is non-standard C++ and mismatches the tensor's byte layout.)
    std::vector<uint8_t> dataC(cTensor->elementsSize(), 1);
    mluC->copyFromHost(dataC.data(), cTensor->bytesSize());
    std::vector<float> dataX(xTensor->elementsSize(), 7.f);
    mluX->copyFromHost(dataX.data(), xTensor->bytesSize());
    std::vector<float> dataY(yTensor->elementsSize(), 3.f);
    mluY->copyFromHost(dataY.data(), yTensor->bytesSize());
    std::vector<float> cpuOut(outTensor->elementsSize());
    // inference
    {
        void const *inputs[]{dataC.data(), dataX.data(), dataY.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{*mluC, *mluX, *mluY};
        void *outputs[]{*mluOut};
        rCnnl(res, *workspace, inputs, outputs);
    }
    // take output data
    std::vector<float> result(outTensor->elementsSize());
    mluOut->copyToHost(result.data(), outTensor->bytesSize());
    // check
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}

TEST(kernel, WhereCnnl) {
    testWhereCnnl(Shape{2, 5}, Shape{2, 3, 1, 5}, Shape{3, 2, 5}, Shape{2, 3, 2, 5});
    testWhereCnnl(Shape{1}, Shape{4}, Shape{1}, Shape{4});
    testWhereCnnl(Shape{3}, Shape{2, 3}, Shape{2, 3}, Shape{2, 3});
}

#endif
src/04kernel/src/collectors/concat.cc | 6 + src/04kernel/src/collectors/mat_mul.cc | 6 + src/04kernel/src/collectors/slice.cc | 6 + src/04kernel/src/collectors/split.cc | 6 + .../src/kernels/concat/cnnl_kernel.cc | 93 ++++++++ .../src/kernels/concat/cnnl_kernel.hh | 26 +++ .../src/kernels/mat_mul/cnnl_kernel.cc | 153 +++++++++++++ .../src/kernels/mat_mul/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/slice/cnnl_kernel.cc | 88 ++++++++ src/04kernel/src/kernels/slice/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/split/cnnl_kernel.cc | 111 ++++++++++ src/04kernel/src/kernels/split/cnnl_kernel.hh | 37 ++++ .../test/kernels/concat/test_cnnl.cpp | 79 +++++++ .../test/kernels/mat_mul/test_cnnl.cpp | 205 ++++++++++++++++++ src/04kernel/test/kernels/slice/test_cnnl.cpp | 59 +++++ src/04kernel/test/kernels/split/test_cnnl.cpp | 80 +++++++ src/07onnx/src/operators/simple_binary.cc | 2 +- 17 files changed, 1020 insertions(+), 1 deletion(-) create mode 100644 src/04kernel/src/kernels/concat/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/concat/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/slice/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/slice/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/split/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/split/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/concat/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/mat_mul/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/slice/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/split/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/concat.cc b/src/04kernel/src/collectors/concat.cc index 8e6386907..27f27c776 100644 --- a/src/04kernel/src/collectors/concat.cc +++ b/src/04kernel/src/collectors/concat.cc @@ -1,6 +1,7 @@ #include 
"kernel/collectors/concat.h" #include "../kernels/concat/cpu_kernel.hh" #include "../kernels/concat/cuda_kernel.hh" +#include "../kernels/concat/cnnl_kernel.hh" namespace refactor::kernel { @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ConcatCnnl::build(axis, inputs, outputs[0].get()); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/mat_mul.cc b/src/04kernel/src/collectors/mat_mul.cc index 7581200cd..d6b323260 100644 --- a/src/04kernel/src/collectors/mat_mul.cc +++ b/src/04kernel/src/collectors/mat_mul.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/mat_mul.h" +#include "../kernels/mat_mul/cnnl_kernel.hh" #include "../kernels/mat_mul/cpu_kernel.hh" #include "../kernels/mat_mul/cublas_kernel.hh" #include "kernel/attributes/mat_mul_info.h" @@ -26,6 +27,11 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(MatMulCublas) break; + case decltype(_target)::Mlu: + if (auto ptr = MatMulCnnl::build(inputs, outputs, transA, transB, alpha, beta); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/slice.cc b/src/04kernel/src/collectors/slice.cc index 0b063dd17..60c93cb28 100644 --- a/src/04kernel/src/collectors/slice.cc +++ b/src/04kernel/src/collectors/slice.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/slice.h" #include "../kernels/slice/cpu_kernel.hh" #include "../kernels/slice/cuda_kernel.hh" +#include "../kernels/slice/cnnl_kernel.hh" namespace refactor::kernel { @@ -26,6 +27,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = SliceCnnl::build(inputs[0].get().dataType, dimentions, inputs[0].get().shape, outputs[0].get().shape); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: 
UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/split.cc b/src/04kernel/src/collectors/split.cc index 6fe22548d..6a3be0a65 100644 --- a/src/04kernel/src/collectors/split.cc +++ b/src/04kernel/src/collectors/split.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/split.h" +#include "../kernels/split/cnnl_kernel.hh" #include "../kernels/split/cpu_kernel.hh" #include "../kernels/split/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = SplitCnnl::build(axis, inputs[0].get(), outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc new file mode 100644 index 000000000..7d11dbaa9 --- /dev/null +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -0,0 +1,93 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = ConcatCnnl; + + K::ConcatCnnl(SplitInfoCnnl info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, TensorRefs inputs, Tensor output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(SplitInfoCnnl(axis, output, inputs)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing split operation using CNNL"; + } + +#ifdef USE_BANG + auto ConcatCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in; + std::vector out; 
+ bool f32; + + explicit Descriptors(int n, decltype(f32) f32_) + : in(nullptr), + out(std::vector(n, nullptr)), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + for (auto i = 0; i < out.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.num, info.dataType != DT::F64); + setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + for (auto i = 0; i < info.outDims.size(); i++) { + setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSplitWorkspaceSize(handle, info.num, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), n = info.num, axis = info.axis, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + const void *argv[n]; + for (auto i = 0; i < n; i++) { + argv[i] = inputs[i]; + } + + CNNL_ASSERT(cnnlConcat( + handle, n, axis, d->out.data(), argv, + workspace, workspaceSize, d->in, outputs[0])); + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.hh b/src/04kernel/src/kernels/concat/cnnl_kernel.hh new file mode 100644 index 000000000..c49d5607a --- /dev/null +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.hh @@ -0,0 +1,26 @@ +#ifndef KERNEL_CONCAT_CNNL_KERNEL_HH +#define KERNEL_CONCAT_CNNL_KERNEL_HH + +#include "../../kernels/split/cnnl_kernel.hh" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + 
struct ConcatCnnl final : public Kernel { + SplitInfoCnnl info; + + explicit ConcatCnnl(SplitInfoCnnl) noexcept; + + static KernelBox build(int, TensorRefs, Tensor) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CONCAT_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc new file mode 100644 index 000000000..7be05b7b2 --- /dev/null +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc @@ -0,0 +1,153 @@ +#include "cnnl_kernel.hh" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = MatMulCnnl; + using DT = DataType; + + K::MatMulCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(TensorRefs inputs_, TensorRefs outputs_, bool transA_, bool transB_, float alpha_, float beta_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + auto dt = inputs_[0].get().dataType; + return dt.isIeee754() || dt == DT::I8 + ? std::make_unique(decltype(info){ + dt, + transA_, + transB_, + alpha_, + beta_, + std::vector(inputs_[0].get().shape.begin(), inputs_[0].get().shape.end()), + std::vector(inputs_[1].get().shape.begin(), inputs_[1].get().shape.end()), + std::vector(outputs_[0].get().shape.begin(), outputs_[0].get().shape.end()), + inputs_.size() == 3 + ? inputs_[2].get().shape.size() == 0 ? 
std::make_optional(std::vector(1, 1)) + : std::make_optional(std::vector( + inputs_[2].get().shape.begin(), + inputs_[2].get().shape.end())) + : std::nullopt, + }) + : nullptr; + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing MatMul using CNNL"; + } + + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t a, b, c; + cnnlMatMulDescriptor_t bmm; + cnnlMatMulAlgo_t algo; + cnnlMatMulHeuristicResult_t heuristic; + cnnlTensorDescriptor_t bias; + bool addBias, f32; + + explicit Descriptors(bool addBias_, bool f32_) + : a(nullptr), b(nullptr), c(nullptr), + bmm(nullptr), algo(nullptr), heuristic(nullptr), + bias(nullptr), addBias(addBias_), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&a)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&b)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&c)); + if (addBias) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&bias)); + } + CNNL_ASSERT(cnnlMatMulDescCreate(&bmm)); + CNNL_ASSERT(cnnlMatMulAlgoCreate(&algo)); + CNNL_ASSERT(cnnlCreateMatMulHeuristicResult(&heuristic)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(a)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(b)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(c)); + if (addBias) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(bias)); + } + CNNL_ASSERT(cnnlMatMulDescDestroy(bmm)); + CNNL_ASSERT(cnnlMatMulAlgoDestroy(algo)); + CNNL_ASSERT(cnnlDestroyMatMulHeuristicResult(heuristic)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.biasDim.has_value(), info.dataType != DT::F64); + 
setCnnlTensor(d->a, info.dataType, slice(info.aDim.data(), info.aDim.size())); + setCnnlTensor(d->b, info.dataType, slice(info.bDim.data(), info.bDim.size())); + setCnnlTensor(d->c, info.dataType, slice(info.cDim.data(), info.cDim.size())); + if (d->addBias) { + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->bias, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), + info.biasDim.value().size(), info.biasDim.value().data())); + } + int32_t tA = info.transA, tB = info.transB; + CNNL_ASSERT(cnnlSetMatMulDescAttr(d->bmm, CNNL_MATMUL_DESC_TRANSA, + &tA, sizeof(int32_t))); + CNNL_ASSERT(cnnlSetMatMulDescAttr(d->bmm, CNNL_MATMUL_DESC_TRANSB, + &tB, sizeof(int32_t))); + auto handle = res.fetchOrStore()->handle; + int returnedAlgoCount = 0; + CNNL_ASSERT(cnnlGetBatchMatMulAlgoHeuristic( + handle, d->bmm, d->a, d->b, d->c, + NULL, 1, &(d->heuristic), &returnedAlgoCount)); + + size_t algoWorkspaceSize; + CNNL_ASSERT(cnnlGetBatchMatMulHeuristicResult(d->heuristic, d->algo, &algoWorkspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), algoWorkspaceSize, + aa = info.alpha, bb = info.beta](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + // build alpha/beta for double + auto alpha = d->f32 ? factor(aa) : factor(aa), + beta = d->f32 ? factor(bb) : factor(bb), + // one = d->f32 ? factor(1) : factor(1), + zero = d->f32 ? factor(0) : factor(0); + + if (d->addBias) { + CNNL_ASSERT(cnnlExpand(handle, d->bias, inputs[2], d->c, outputs[0])); + } + + if (alpha != 0) { + CNNL_ASSERT(cnnlBatchMatMulBCast_v2( + handle, d->bmm, d->algo, &alpha, + d->a, inputs[0], d->b, inputs[1], + d->addBias ? 
&beta : &zero, d->c, outputs[0], + workspace, algoWorkspaceSize)); + } + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), algoWorkspaceSize}; + } + + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh new file mode 100644 index 000000000..9b44b192c --- /dev/null +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_MATMUL_CNNL_KERNEL_HH +#define KERNEL_MATMUL_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct MatMulCnnl final : public Kernel { + struct { + DataType dataType; + bool transA, transB; + float alpha, beta; + std::vector aDim, bDim, cDim; + std::optional> biasDim; + } info; + + explicit MatMulCnnl(decltype(info)) noexcept; + + static KernelBox build(TensorRefs, TensorRefs, bool, bool, float, float) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_MATMUL_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.cc b/src/04kernel/src/kernels/slice/cnnl_kernel.cc new file mode 100644 index 000000000..b5f8b4c85 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.cc @@ -0,0 +1,88 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = SliceCnnl; + + K::SliceCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(DataType dt_, Dimensions dims_, Shape in_, Shape out_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return 
std::make_unique(decltype(info){ + dt_, + dims_, + std::vector(in_.begin(), in_.end()), + std::vector(out_.begin(), out_.end()), + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing slice operation using CNNL"; + } + +#ifdef USE_BANG + auto SliceCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in, out; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : in(nullptr), out(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dataType != DT::F64); + // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + // setCnnlTensor(d->out, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data())); + std::vector begin, end, stride; + for (auto i = 0; i < info.dims.size(); i++) { + // [begin, end), end is not inclued + begin.push_back(info.dims[i].start); + auto sign = info.dims[i].step > 0 ? 
1 : -1; + end.push_back(info.dims[i].start + info.dims[i].step * (info.dims[i].length - 1) + sign); + stride.push_back(info.dims[i].step); + } + + res.fetchOrStore(); + return [d = std::move(d), begin, end, stride](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + CNNL_ASSERT(cnnlStridedSlice( + handle, d->in, inputs[0], + begin.data(), end.data(), stride.data(), + d->out, outputs[0])); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.hh b/src/04kernel/src/kernels/slice/cnnl_kernel.hh new file mode 100644 index 000000000..ace79da1d --- /dev/null +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_SLICE_CNNL_KERNEL_HH +#define KERNEL_SLICE_CNNL_KERNEL_HH + +#include "kernel/attributes/slice_info.h" +#include "kernel/collectors/slice.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct SliceCnnl final : public Kernel { + struct { + DataType dataType; + Dimensions dims; + std::vector inDim, outDim; + } info; + + explicit SliceCnnl(decltype(info)) noexcept; + + static KernelBox build(DataType, Dimensions, Shape, Shape) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SLICE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc new file mode 100644 index 000000000..ace809ffa --- /dev/null +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -0,0 +1,111 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + 
+namespace refactor::kernel { + using K = SplitCnnl; + using Info = SplitInfoCnnl; + + Info::SplitInfoCnnl(DataType dt_, int axis_, int num_, std::vector in_, std::vector> out_) + : dataType(dt_), axis(axis_), num(num_), inDim(std::move(in_)), outDims(std::move(out_)) {} + + + Info::SplitInfoCnnl(int axis, Tensor input, TensorRefs outputs) + : SplitInfoCnnl(input.dataType, axis, outputs.size(), + std::move(std::vector(input.shape.begin(), input.shape.end())), + std::move([](TensorRefs tensors) -> std::vector> { + std::vector> res; + for (uint32_t i = 0; i < tensors.size(); i++) { + res.push_back(std::vector(tensors[i].get().shape.begin(), + tensors[i].get().shape.end())); + } + return res; + }(outputs))) {} + + K::SplitCnnl(SplitInfoCnnl info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, Tensor input, TensorRefs outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(SplitInfoCnnl(axis, input, outputs)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing concat operation using CNNL"; + } + +#ifdef USE_BANG + auto SplitCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in; + std::vector out; + bool f32; + + explicit Descriptors(int n, decltype(f32) f32_) + : in(nullptr), + out(std::vector(n, nullptr)), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + for (auto i = 0; i < out.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + + 
Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.num, info.dataType != DT::F64); + setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + for (auto i = 0; i < info.outDims.size(); i++) { + setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSplitWorkspaceSize(handle, info.num, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), n = info.num, axis = info.axis, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + void *argv[n]; + for (auto i = 0; i < n; i++) { + argv[i] = outputs[i]; + } + + CNNL_ASSERT(cnnlSplit( + handle, n, axis, d->in, inputs[0], + workspace, workspaceSize, d->out.data(), argv)); + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.hh b/src/04kernel/src/kernels/split/cnnl_kernel.hh new file mode 100644 index 000000000..bcde3d708 --- /dev/null +++ b/src/04kernel/src/kernels/split/cnnl_kernel.hh @@ -0,0 +1,37 @@ +#ifndef KERNEL_SPLIT_CNNL_KERNEL_HH +#define KERNEL_SPLIT_CNNL_KERNEL_HH + +#include "kernel/collectors/split.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + struct SplitInfoCnnl { + DataType dataType; + int axis; + int num; + std::vector inDim; + std::vector> outDims; + + SplitInfoCnnl(DataType, int, int, std::vector, std::vector>); + SplitInfoCnnl(int, Tensor, TensorRefs); + }; + + struct SplitCnnl final : public Kernel { + SplitInfoCnnl info; + + explicit SplitCnnl(SplitInfoCnnl) noexcept; + + static KernelBox build(int, Tensor, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t 
kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_CNNL_KERNEL_HH diff --git a/src/04kernel/test/kernels/concat/test_cnnl.cpp b/src/04kernel/test/kernels/concat/test_cnnl.cpp new file mode 100644 index 000000000..2911c9e90 --- /dev/null +++ b/src/04kernel/test/kernels/concat/test_cnnl.cpp @@ -0,0 +1,79 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/concat/cpu_kernel.hh" +#include "../../../src/kernels/concat/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ConcatCnnl) { + // build routine + std::vector> inputTensors{ + Tensor::share(DataType::F32, Shape{2, 3, 1, 1, 7, 7}),// 勿 + Tensor::share(DataType::F32, Shape{2, 3, 1, 9, 7, 7}),// 忘 + Tensor::share(DataType::F32, Shape{2, 3, 1, 3, 7, 7}),// 国 + Tensor::share(DataType::F32, Shape{2, 3, 1, 7, 7, 7}),// 耻 + }; + auto result = Tensor::share(DataType::F32, Shape{2, 3, 1, 20, 7, 7}); + TensorRefs inputs_; + inputs_.reserve(inputTensors.size()); + std::transform(inputTensors.begin(), inputTensors.end(), + std::back_inserter(inputs_), + [](auto const &it) { return std::cref(*it); }); + SplitInfo info(3, inputs_); + auto kCpu = ConcatCpu::build(info); + auto kernel = ConcatCnnl::build(3, inputs_, *result); + ASSERT_TRUE(kCpu && kernel); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIns[]{ + dev.malloc(inputTensors[0]->bytesSize()), + dev.malloc(inputTensors[1]->bytesSize()), + dev.malloc(inputTensors[2]->bytesSize()), + dev.malloc(inputTensors[3]->bytesSize()), + }, + mluOut = dev.malloc(result->bytesSize()); 
+ // put input data + std::vector + cpuIns[]{ + std::vector(inputTensors[0]->elementsSize()), + std::vector(inputTensors[1]->elementsSize()), + std::vector(inputTensors[2]->elementsSize()), + std::vector(inputTensors[3]->elementsSize()), + }, + cpuOut(result->elementsSize()), + out(result->elementsSize()); + std::iota(cpuIns[0].begin(), cpuIns[0].end(), 0); + std::iota(cpuIns[1].begin(), cpuIns[1].end(), 0); + std::iota(cpuIns[2].begin(), cpuIns[2].end(), 0); + std::iota(cpuIns[3].begin(), cpuIns[3].end(), 0); + mluIns[0]->copyFromHost(cpuIns[0].data(), inputTensors[0]->bytesSize()); + mluIns[1]->copyFromHost(cpuIns[1].data(), inputTensors[1]->bytesSize()); + mluIns[2]->copyFromHost(cpuIns[2].data(), inputTensors[2]->bytesSize()); + mluIns[3]->copyFromHost(cpuIns[3].data(), inputTensors[3]->bytesSize()); + // inference + { + void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + mluOut->copyToHost(out.data(), result->bytesSize()); + EXPECT_EQ(out, cpuOut); +} + +#endif diff --git a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp new file mode 100644 index 000000000..251c5c898 --- /dev/null +++ b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp @@ -0,0 +1,205 @@ +#ifdef USE_BANG + +#include "../src/kernels/mat_mul/cnnl_kernel.hh" +#include "../src/kernels/mat_mul/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TensorRefs getRefs(std::vector> tensors) { + TensorRefs refs; + std::transform(tensors.begin(), tensors.end(), std::back_inserter(refs), + [](auto const &it) { return std::cref(*it); }); + return refs; +} + +TEST(kernel, 
MatMulCnnl_OnlyBias) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{2, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{2, 2}); + auto C = Tensor::share(DataType::F32, Shape{}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2}); + bool tA = false, tB = false; + float alpha = 0.0, beta = 1.0; + MatMulInfo info(*A, *B, *C, tA, tB, alpha, beta); + auto kernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), tA, tB, 0, 0); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + // put input data + std::vector dataA{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector dataB{0.0, 0.0, 0.0, 0.0}; + std::vector dataC{2.5}; + std::vector ans{2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5}; + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + routine(res, *workspace, inputs, outputs); + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(result[i], ans[i]); + } +} + +TEST(kernel, MatMulCnnl_Broadcast) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{2, 1, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{1, 2, 2, 2}); + auto C = Tensor::share(DataType::F32, Shape{2, 1}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2, 2}); + MatMulInfo info(*A, *B, *C, false, false, 1, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), false, false, 1.0, 1.0); + 
ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}; + std::vector dataB{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}; + std::vector dataC{1.0, 0.0}; + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +TEST(kernel, MatMulCnnl_TransABNoBias) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{1, 3, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{2, 1, 2, 2}); + auto Y = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + MatMulInfo info(*A, *B, {}, true, true, 2.0, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B}), getRefs({Y}), true, true, 2.0, 1.0); + ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0, + 1.0, 2.0, 3.0, 4.0}; + std::vector dataB{1.0, 2.0, 0.0, 0.5, + 1.0, 
0.0, 0.0, 1.0}; + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +TEST(kernel, MatMulCnnl_Large) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{1, 512}); + auto B = Tensor::share(DataType::F32, Shape{1000, 512}); + auto C = Tensor::share(DataType::F32, Shape{1000}); + auto Y = Tensor::share(DataType::F32, Shape{1, 1000}); + MatMulInfo info(*A, *B, *C, false, true, 1, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), false, true, 1.0, 1.0); + ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA(A->elementsSize()); + for (auto i : range0_(dataA.size())) { + dataA[i] = 1.0 * (i % 4) - 2.0; + } + std::vector dataB(B->elementsSize()); + for (auto i : range0_(dataB.size())) { + dataB[i] = 1.0 * (i % 4) - 2.0; + } + std::vector dataC(C->elementsSize()); + for (auto i : range0_(dataC.size())) { + dataC[i] = 1.0 * (i % 4) - 2.0; + } + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb 
= dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +#endif diff --git a/src/04kernel/test/kernels/slice/test_cnnl.cpp b/src/04kernel/test/kernels/slice/test_cnnl.cpp new file mode 100644 index 000000000..914a8fee4 --- /dev/null +++ b/src/04kernel/test/kernels/slice/test_cnnl.cpp @@ -0,0 +1,59 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../../../src/kernels/slice/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SliceCnnl) { + // build routine + Dimensions dims{ + {5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360} + {2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90} + {1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30} + {0, 1, 1}, // 1 -> {0} + {0, 1, 2}, // 2 -> {0, 1} + {0, 1, 3}, // 3 -> {0, 1, 2} + }; + auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}), + output = Tensor::share(DataType::F32, Shape{3, 2, 3, 1, 2, 3}); + SliceInfo info(dims, *input); + auto kernel = SliceCnnl::build(DataType::F32, dims, input->shape, output->shape); + auto kCpu = SliceCpu::build(info); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + auto rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = 
*device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(input->bytesSize()), + mluOut = dev.malloc(output->bytesSize()); + // put input data + std::vector + data(input->elementsSize()), + ans(output->elementsSize()), + result(ans.size()); + std::iota(data.begin(), data.end(), 0); + mluIn->copyFromHost(data.data(), input->bytesSize()); + // inference + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{ans.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + mluOut->copyToHost(result.data(), output->bytesSize()); + EXPECT_EQ(result, ans); +} + +#endif diff --git a/src/04kernel/test/kernels/split/test_cnnl.cpp b/src/04kernel/test/kernels/split/test_cnnl.cpp new file mode 100644 index 000000000..5afa434e6 --- /dev/null +++ b/src/04kernel/test/kernels/split/test_cnnl.cpp @@ -0,0 +1,80 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/split/cpu_kernel.hh" +#include "../../../src/kernels/split/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SplitCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 20, 7, 7}); + std::vector> outputTensors{ + Tensor::share(DataType::F32, Shape{2, 3, 1, 1, 7, 7}),// 勿 + Tensor::share(DataType::F32, Shape{2, 3, 1, 9, 7, 7}),// 忘 + Tensor::share(DataType::F32, Shape{2, 3, 1, 3, 7, 7}),// 国 + Tensor::share(DataType::F32, Shape{2, 3, 1, 7, 7, 7}),// 耻 + }; + TensorRefs outputs_; + outputs_.reserve(outputTensors.size()); + std::transform(outputTensors.begin(), outputTensors.end(), + std::back_inserter(outputs_), + [](auto const &it) { return std::cref(*it); }); + auto info = SplitInfo(3, outputs_); + auto kCpu = SplitCpu::build(info); + auto kernel = SplitCnnl::build(3, *dataTensor, outputs_); + ASSERT_TRUE(kCpu && kernel); + auto res = 
runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize]= kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIn = dev.malloc(dataTensor->bytesSize()), + mluOuts[]{ + dev.malloc(outputTensors[0]->bytesSize()), + dev.malloc(outputTensors[1]->bytesSize()), + dev.malloc(outputTensors[2]->bytesSize()), + dev.malloc(outputTensors[3]->bytesSize()), + }; + // put input data + std::vector + data(dataTensor->elementsSize()), + outsCpu[]{ + std::vector(outputTensors[0]->elementsSize()), + std::vector(outputTensors[1]->elementsSize()), + std::vector(outputTensors[2]->elementsSize()), + std::vector(outputTensors[3]->elementsSize()), + }, + outs[]{ + std::vector(outputTensors[0]->elementsSize()), + std::vector(outputTensors[1]->elementsSize()), + std::vector(outputTensors[2]->elementsSize()), + std::vector(outputTensors[3]->elementsSize()), + }; + std::iota(data.begin(), data.end(), 0); + mluIn->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOuts[0], *mluOuts[1], *mluOuts[2], *mluOuts[3]}; + routine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{outsCpu[0].data(), outsCpu[1].data(), outsCpu[2].data(), outsCpu[3].data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + for (auto i : range0_(outputTensors.size())) { + mluOuts[i]->copyToHost(outs[i].data(), outputTensors[i]->bytesSize()); + EXPECT_EQ(outs[i], outsCpu[i]); + } +} + +#endif diff --git a/src/07onnx/src/operators/simple_binary.cc b/src/07onnx/src/operators/simple_binary.cc index 2db99bdd3..8ff7660b4 100644 --- a/src/07onnx/src/operators/simple_binary.cc +++ b/src/07onnx/src/operators/simple_binary.cc @@ -10,7 +10,7 @@ namespace refactor::onnx { : Operator(), type(type_) {} auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> 
OpBox { - auto fmod = attributes.getOrInsert( "fmod", {0}).int_(); + auto fmod = attributes.getOrInsert("fmod", {0}).int_(); // clang-format off auto type = opType == "onnx::Add" ? Ty::Add : From beda02971cd50cca84626b657db803982e9b40f6 Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Tue, 16 Jan 2024 03:19:08 +0000 Subject: [PATCH 4/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=92?= =?UTF-8?q?=E6=AD=A6=E7=BA=AA=E5=B9=B3=E5=8F=B0erf/mod/cast/clip/gather/sc?= =?UTF-8?q?atternd=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/cast.cc | 6 + src/04kernel/src/collectors/clip.cc | 6 + src/04kernel/src/collectors/gather.cc | 6 + src/04kernel/src/collectors/scatter_nd.cc | 6 + src/04kernel/src/kernels/cast/cnnl_kernel.cc | 226 ++++++++++++++++++ src/04kernel/src/kernels/cast/cnnl_kernel.hh | 27 +++ src/04kernel/src/kernels/clip/cnnl_kernel.cc | 66 +++++ src/04kernel/src/kernels/clip/cnnl_kernel.hh | 28 +++ .../src/kernels/gather/cnnl_kernel.cc | 86 +++++++ .../src/kernels/gather/cnnl_kernel.hh | 30 +++ .../src/kernels/scatter_nd/cnnl_kernel.cc | 86 +++++++ .../src/kernels/scatter_nd/cnnl_kernel.hh | 30 +++ .../src/kernels/simple_binary/binary_cnnl.cc | 27 ++- .../simple_unary/cnnl_simple_unary_kernel.cc | 4 +- src/04kernel/test/kernels/cast/test_cnnl.cpp | 49 ++++ src/04kernel/test/kernels/clip/test_cnnl.cpp | 51 ++++ .../test/kernels/gather/test_gather_cnnl.cpp | 99 ++++++++ .../test/kernels/scatter_nd/test_cnnl.cpp | 63 +++++ .../simple_binary/test_binary_cnnl.cpp | 89 +++++-- .../test/kernels/simple_unary/test_cnnl.cpp | 1 + 20 files changed, 951 insertions(+), 35 deletions(-) create mode 100644 src/04kernel/src/kernels/cast/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/cast/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/clip/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/clip/cnnl_kernel.hh create mode 100644 
src/04kernel/src/kernels/gather/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/gather/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/cast/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/clip/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/gather/test_gather_cnnl.cpp create mode 100644 src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/cast.cc b/src/04kernel/src/collectors/cast.cc index bca9d2629..45bf3372b 100644 --- a/src/04kernel/src/collectors/cast.cc +++ b/src/04kernel/src/collectors/cast.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/cast.h" #include "../kernels/cast/cpu_kernel.hh" #include "../kernels/cast/cuda_kernel.hh" +#include "../kernels/cast/cnnl_kernel.hh" namespace refactor::kernel { @@ -24,6 +25,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = CastCnnl::build(from, to); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/clip.cc b/src/04kernel/src/collectors/clip.cc index 06ccd020b..364fbe588 100644 --- a/src/04kernel/src/collectors/clip.cc +++ b/src/04kernel/src/collectors/clip.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/clip.h" #include "../kernels/clip/cpu_kernel.hh" #include "../kernels/clip/cuda_kernel.hh" +#include "../kernels/clip/cnnl_kernel.hh" namespace refactor::kernel { @@ -24,6 +25,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ClipCnnl::build(data, hasMax); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/gather.cc b/src/04kernel/src/collectors/gather.cc index 
9c30f1c62..3b66b91f0 100644 --- a/src/04kernel/src/collectors/gather.cc +++ b/src/04kernel/src/collectors/gather.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/gather.h" +#include "../kernels/gather/cnnl_kernel.hh" #include "../kernels/gather/cpu_kernel.hh" #include "../kernels/gather/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = GatherCnnl::build(axis, inputs[0].get(), inputs[1].get(), outputs[0].get()); ptr != nullptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/scatter_nd.cc b/src/04kernel/src/collectors/scatter_nd.cc index 62d63c802..3b43a25fb 100644 --- a/src/04kernel/src/collectors/scatter_nd.cc +++ b/src/04kernel/src/collectors/scatter_nd.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/scatter_nd.h" #include "../kernels/scatter_nd/cpu_kernel.hh" #include "../kernels/scatter_nd/cuda_kernel.hh" +#include "../kernels/scatter_nd/cnnl_kernel.hh" namespace refactor::kernel { @@ -23,6 +24,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ScatterNDCnnl::build(inputs, outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc new file mode 100644 index 000000000..951209741 --- /dev/null +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -0,0 +1,226 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + + +namespace refactor::kernel { + using K = CastCnnl; + using DT = DataType; + + K::CastCnnl(decltype(from) from_, + decltype(to) to_, + decltype(shape) shape_) noexcept + : from(from_), to(to_), shape(shape_) {} + + auto 
K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(from.dataType, to.dataType, + std::vector(from.shape.begin(), from.shape.end())); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing cast operation using CNNL"; + } + +#ifdef USE_BANG + + static cnnlCastDataType_t castType(DataType from, DataType to); + + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + cnnlCastDataType_t cast; + + Descriptors() : inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + d->cast = castType(from, to); + setCnnlTensor(d->inDesc, from, slice(shape.data(), shape.size())); + setCnnlTensor(d->outDesc, to, slice(shape.data(), shape.size())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } + + static cnnlCastDataType_t castType(DataType from, DataType to) { + switch (from) { + case DT::F32: + switch (to) { + case DT::F64: + return CNNL_CAST_FLOAT_TO_DOUBLE; + case DT::FP16: + return CNNL_CAST_FLOAT_TO_HALF; + case DT::I64: + return CNNL_CAST_FLOAT_TO_INT64; + case DT::I32: + return CNNL_CAST_FLOAT_TO_INT32; + case DT::I16: + 
return CNNL_CAST_FLOAT_TO_INT16; + case DT::I8: + return CNNL_CAST_FLOAT_TO_INT8; + case DT::U8: + return CNNL_CAST_FLOAT_TO_UINT8; + // case DT::BF16: + // return CNNL_CAST_FLOAT_TO_BFLOAT16; + case DT::Bool: + return CNNL_CAST_FLOAT_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::FP16: + switch (to) { + case DT::F32: + return CNNL_CAST_HALF_TO_FLOAT; + case DT::I64: + return CNNL_CAST_HALF_TO_INT64; + case DT::I32: + return CNNL_CAST_HALF_TO_INT32; + case DT::I16: + return CNNL_CAST_HALF_TO_INT16; + case DT::I8: + return CNNL_CAST_HALF_TO_INT8; + case DT::U8: + return CNNL_CAST_HALF_TO_UINT8; + case DT::Bool: + return CNNL_CAST_HALF_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::I32: + switch (to) { + case DT::F32: + return CNNL_CAST_INT32_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT32_TO_HALF; + case DT::I64: + return CNNL_CAST_INT32_TO_INT64; + case DT::I16: + return CNNL_CAST_INT32_TO_INT16; + case DT::I8: + return CNNL_CAST_INT32_TO_INT8; + case DT::Bool: + return CNNL_CAST_INT32_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::I16: + switch (to) { + case DT::F32: + return CNNL_CAST_INT16_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT16_TO_HALF; + case DT::I32: + return CNNL_CAST_INT16_TO_INT32; + // case DT::I8: + // return CNNL_CAST_INT16_TO_INT8; + default: + UNREACHABLE(); + } + case DT::I8: + switch (to) { + case DT::F32: + return CNNL_CAST_INT8_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT8_TO_HALF; + case DT::I32: + return CNNL_CAST_INT8_TO_INT32; + case DT::I16: + return CNNL_CAST_INT8_TO_INT16; + default: + UNREACHABLE(); + } + case DT::U8: + switch (to) { + case DT::F32: + return CNNL_CAST_UINT8_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_UINT8_TO_HALF; + case DT::I64: + return CNNL_CAST_UINT8_TO_INT64; + case DT::I32: + return CNNL_CAST_UINT8_TO_INT32; + default: + UNREACHABLE(); + } + case DT::Bool: + switch (to) { + case DT::F32: + return CNNL_CAST_BOOL_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_BOOL_TO_HALF; + case 
DT::I32: + return CNNL_CAST_BOOL_TO_INT32; + default: + UNREACHABLE(); + } + case DT::I64: + switch (to) { + case DT::F32: + return CNNL_CAST_INT64_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT64_TO_HALF; + case DT::I32: + return CNNL_CAST_INT64_TO_INT32; + case DT::U32: + return CNNL_CAST_INT64_TO_UINT32; + default: + UNREACHABLE(); + } + case DT::U32: + switch (to) { + case DT::I64: + return CNNL_CAST_UINT32_TO_INT64; + case DT::U64: + return CNNL_CAST_UINT32_TO_UINT64; + default: + UNREACHABLE(); + } + case DT::F64: + switch (to) { + case DT::F32: + return CNNL_CAST_DOUBLE_TO_FLOAT; + default: + UNREACHABLE(); + } + case DT::BF16: + switch (to) { + // case DT::F32: + // return CNNL_CAST_BF16_TO_FLOAT; + default: + UNREACHABLE(); + } + default: + UNREACHABLE(); + } + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.hh b/src/04kernel/src/kernels/cast/cnnl_kernel.hh new file mode 100644 index 000000000..b1e638080 --- /dev/null +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_CAST_CNNL_KERNEL_HH +#define KERNEL_CAST_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct CastCnnl final : public Kernel { + DataType from, to; + std::vector shape; + + CastCnnl(decltype(from), decltype(to), decltype(shape)) noexcept; + + static KernelBox build(Tensor const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CAST_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.cc b/src/04kernel/src/kernels/clip/cnnl_kernel.cc new file mode 100644 index 000000000..423939bc9 --- /dev/null +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.cc @@ -0,0 +1,66 @@ +#include 
"cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ClipCnnl; + + K::ClipCnnl(decltype(dataType) dt, + decltype(shape) shape_, + decltype(hasMax) hasMax_) noexcept + : dataType(dt), shape(shape_), hasMax(hasMax_) { + } + + auto K::build(Tensor const &data, bool hasMax) noexcept -> KernelBox { + return data.dataType.isCpuNumberic() + ? std::make_unique(data.dataType, + std::vector(data.shape.begin(), data.shape.end()), + hasMax) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing clip operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t t; + + Descriptors() : t(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&t)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(t)); + } + }; + auto d = std::make_shared(); + setCnnlTensor(d->t, dataType, slice(shape.data(), shape.size())); + + res.fetchOrStore(); + return [d = std::move(d), hasMax = this->hasMax](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlClip_v2(res.fetchOrStore()->handle, + CNNL_POINTER_MODE_DEVICE, d->t, + inputs[0], inputs[1], hasMax ? 
inputs[2] : nullptr, + d->t, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.hh b/src/04kernel/src/kernels/clip/cnnl_kernel.hh new file mode 100644 index 000000000..37d168062 --- /dev/null +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_CLIP_CNNL_KERNEL_HH +#define KERNEL_CLIP_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ClipCnnl final : public Kernel { + DataType dataType; + std::vector shape; + bool hasMax; + + ClipCnnl(decltype(dataType), decltype(shape), decltype(hasMax)) noexcept; + + static KernelBox build(Tensor const &, bool hasMax) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CLIP_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc new file mode 100644 index 000000000..19f9ed358 --- /dev/null +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = GatherCnnl; + + K::GatherCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, Tensor input, Tensor index, Tensor output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info){ + input.dataType, + index.dataType, + axis, + std::vector(input.shape.begin(), input.shape.end()), + std::vector(index.shape.begin(), index.shape.end()), + 
std::vector(output.shape.begin(), output.shape.end()), + }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing gather using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, indexDesc, outDesc; + + Descriptors() : inDesc(nullptr), indexDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&indexDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(indexDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + info.indexDim.size(), info.indexDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outDim.size(), info.outDim.data())); + + size_t workspaceSize = info.inDim.size() * sizeof(int); + + res.fetchOrStore(); + auto routine = [d = std::move(d), + shape = info.inDim.data(), workspaceSize, + dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore()->handle, dim, + d->inDesc, inputs[0], reinterpret_cast(workspace), + d->indexDesc, 
reinterpret_cast(inputs[1]), + d->outDesc, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.hh b/src/04kernel/src/kernels/gather/cnnl_kernel.hh new file mode 100644 index 000000000..f60a8ada0 --- /dev/null +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.hh @@ -0,0 +1,30 @@ +#ifndef KERNEL_GATHER_CNNL_KERNEL_HH +#define KERNEL_GATHER_CNNL_KERNEL_HH + +#include "kernel/attributes/gather_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct GatherCnnl final : public Kernel { + struct { + DataType dataType, indexDataType; + int axis; + std::vector inDim, indexDim, outDim; + } info; + + explicit GatherCnnl(decltype(info)) noexcept; + + static KernelBox build(int, Tensor, Tensor, Tensor) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_TRANSPOSE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc new file mode 100644 index 000000000..8f181b58b --- /dev/null +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ScatterNDCnnl; + + K::ScatterNDCnnl(decltype(info) info_) + : Kernel(), info(std::move(info_)) {} + + auto K::build(TensorRefs inputs, TensorRefs outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info){ + inputs[0].get().dataType, + inputs[1].get().dataType, + 
inputs[2].get().dataType, + std::vector(inputs[0].get().shape.begin(), inputs[0].get().shape.end()), + std::vector(inputs[1].get().shape.begin(), inputs[1].get().shape.end()), + std::vector(inputs[2].get().shape.begin(), inputs[2].get().shape.end()), + std::vector(outputs[0].get().shape.begin(), outputs[0].get().shape.end()), + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing scatterNd operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, indexDesc, updateDesc, outDesc; + + Descriptors() : inDesc(nullptr), indexDesc(nullptr), + updateDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&indexDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&updateDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(indexDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(updateDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + info.indexDim.size(), info.indexDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->updateDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.updateDataType), + info.updateDim.size(), info.updateDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + 
d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.updateDataType), + info.outDim.size(), info.outDim.data())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlScatterNd_v2(res.fetchOrStore()->handle, CNNL_SCATTERND_UPDATE, + d->indexDesc, inputs[1], d->updateDesc, inputs[2], + d->inDesc, inputs[0], d->outDesc, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh new file mode 100644 index 000000000..81fca0357 --- /dev/null +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh @@ -0,0 +1,30 @@ +#ifndef KERNEL_SCATTER_ND_CNNL_KERNEL_HH +#define KERNEL_SCATTER_ND_CNNL_KERNEL_HH + +#include "kernel/attributes/scatter_nd_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct ScatterNDCnnl final : public Kernel { + struct { + DataType dataType, indexDataType, updateDataType; + std::vector inDim, indexDim, updateDim, outDim; + } info; + + explicit ScatterNDCnnl(decltype(info)); + + static KernelBox build(TensorRefs, TensorRefs) noexcept; + + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SCATTER_ND_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index cfe2ce0fa..86e605349 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -16,21 +16,19 @@ namespace refactor::kernel { auto K::build(Op op, Tensor const &a, Tensor const &b, Tensor const &c) noexcept -> KernelBox 
{ static const std::unordered_set - ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow}; + ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow, Op::Mod, Op::Fmod}; #ifndef USE_BANG return nullptr; #endif if (a.dataType != b.dataType || - !a.dataType.isFloat() || + // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c (a.shape != c.shape && b.shape != c.shape) || // Sub only supports brocasting b - (a.shape != c.shape && op == Op::Sub) || - // Cnnl binary op only supports up to 5D - !((a.rank() == 5 && b.rank() == 5) || (a.rank() <= 4 && b.rank() <= 4))) { + (a.shape != c.shape && op == Op::Sub)) { return nullptr; } @@ -103,16 +101,19 @@ namespace refactor::kernel { setCnnlTensor(d->aDesc, dataType, slice(aDims.data(), aDims.size())); setCnnlTensor(d->bDesc, dataType, slice(bDims.data(), bDims.size())); setCnnlTensor(d->cDesc, dataType, slice(cDims.data(), cDims.size())); - CNNL_ASSERT(cnnlSetOpTensorDescriptor( - d->opDesc, cnnlOP, - cnnlDataTypeConvert(d->f32 ? DT::F32 : DT::F64), - CNNL_NOT_PROPAGATE_NAN)); + if (cnnlOP) { + CNNL_ASSERT(cnnlSetOpTensorDescriptor( + d->opDesc, cnnlOP, + cnnlDataTypeConvert(dataType), + CNNL_NOT_PROPAGATE_NAN)); + } auto cnnlGetBinaryWorkspaceSize = (opType == SimpleBinaryType::Add || opType == SimpleBinaryType::Sub || opType == SimpleBinaryType::Mul) ? cnnlGetOpTensorWorkspaceSize : (opType == SimpleBinaryType::Div) ? cnnlGetDivWorkspaceSize : (opType == SimpleBinaryType::And || opType == SimpleBinaryType::Or || opType == SimpleBinaryType::Xor) ? cnnlGetLogicOpWorkspaceSize : (opType == SimpleBinaryType::Pow) ? cnnlGetPowWorkspaceSize + : (opType == SimpleBinaryType::Mod || opType == SimpleBinaryType::Fmod) ? 
cnnlGetFloorModWorkspaceSize : nullptr; if (cnnlGetBinaryWorkspaceSize == nullptr) { @@ -184,7 +185,15 @@ namespace refactor::kernel { d->bDesc, b, workspace, workspaceSize, d->cDesc, c)); + } else if (op == SimpleBinaryType::Mod || op == SimpleBinaryType::Fmod) { + CNNL_ASSERT(cnnlFloorMod(handle, + d->aDesc, a, + d->bDesc, b, + d->cDesc, c, + workspace, workspaceSize)); } + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc index f6e32159d..0c91a28d3 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -17,7 +17,7 @@ namespace refactor::kernel { : Kernel(), type(type_), dataType(dataType_), size(size_) {} auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { - static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg}; + static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg, Op::Erf}; #ifndef USE_BANG return nullptr; @@ -74,6 +74,8 @@ namespace refactor::kernel { return cnnlNegTensor(handle, x_desc, x, y_desc, y); case Ty::Sqrt: return cnnlSqrt_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); + case Ty::Erf: + return cnnlErf_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); default: UNREACHABLE(); } diff --git a/src/04kernel/test/kernels/cast/test_cnnl.cpp b/src/04kernel/test/kernels/cast/test_cnnl.cpp new file mode 100644 index 000000000..72f842470 --- /dev/null +++ b/src/04kernel/test/kernels/cast/test_cnnl.cpp @@ -0,0 +1,49 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/cast/cpu_kernel.hh" +#include "../../../src/kernels/cast/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + 
+TEST(kernel, CastCnnl) { + // build routine + auto x = Tensor::share(DataType::F32, Shape{2, 3, 4, 5}); + auto y = Tensor::share(DataType::I8, Shape{2, 3, 4, 5}); + auto kernel = CastCnnl::build(*x, *y), + kCpu = CastCpu::build(*x, *y); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto xMlu = dev.malloc(x->bytesSize()), + yMlu = dev.malloc(y->bytesSize()); + // put input data + std::vector x_(x->elementsSize()); + std::vector y_(y->elementsSize()); + std::iota(x_.begin(), x_.end(), 0); + xMlu->copyFromHost(x_.data(), x->bytesSize()); + // inference + { + void const *inputs[]{*xMlu}; + void *outputs[]{*yMlu}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{x_.data()}; + void *outputs[]{y_.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(y->elementsSize()); + yMlu->copyToHost(result.data(), y->bytesSize()); + EXPECT_EQ(result, y_); +} + +#endif diff --git a/src/04kernel/test/kernels/clip/test_cnnl.cpp b/src/04kernel/test/kernels/clip/test_cnnl.cpp new file mode 100644 index 000000000..ab29a1cb5 --- /dev/null +++ b/src/04kernel/test/kernels/clip/test_cnnl.cpp @@ -0,0 +1,51 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/clip/cpu_kernel.hh" +#include "../../../src/kernels/clip/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ClipCnnl) { + // build routine + auto data = Tensor::share(DataType::F32, Shape{2, 3, 4, 5}); + auto kernel = ClipCnnl::build(*data, true), + kCpu = ClipCpu::build(*data, true); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto 
mluMem = dev.malloc(data->bytesSize()), + mluMin = dev.malloc(sizeof(float)), + mluMax = dev.malloc(sizeof(float)); + // put input data + std::vector value(data->elementsSize()); + float min = 30, max = 80; + std::iota(value.begin(), value.end(), 0); + mluMem->copyFromHost(value.data(), data->bytesSize()); + mluMin->copyFromHost(&min, sizeof(float)); + mluMax->copyFromHost(&max, sizeof(float)); + // inference + { + void const *inputs[]{*mluMem, *mluMin, *mluMax}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{value.data(), &min, &max}; + void *outputs[]{value.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(data->elementsSize()); + mluMem->copyToHost(result.data(), data->bytesSize()); + EXPECT_EQ(result, value); +} + +#endif diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp new file mode 100644 index 000000000..a3f739cd0 --- /dev/null +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -0,0 +1,99 @@ +#ifdef USE_BANG + +#include "../src/kernels/gather/cnnl_kernel.hh" +#include "../src/kernels/gather/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, GatherCnnl) { + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + // Case axis = 0, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{3, 2}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I32, Shape{2, 2}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{2, 2, 2}, LayoutType::NCHW); + GatherInfo info(0, *data, *indices); + auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = 
cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a{1.0, 1.2, 2.3, 3.4, 4.5, 5.7}; + std::vector b{0, 1, 1, 2}; + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } + + // Case axis = 1, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{3, 3}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I32, Shape{1, 2}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{3, 1, 2}, LayoutType::NCHW); + GatherInfo info(1, *data, *indices); + auto cnnlKernel = GatherCnnl::build(1, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a{1.0, 1.2, 1.9, 2.3, 3.4, 3.9, 4.5, 5.7, 5.9}; + std::vector b{0, 2}; + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + 
bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } +} + +#endif diff --git a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp new file mode 100644 index 000000000..1cdf72166 --- /dev/null +++ b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp @@ -0,0 +1,63 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" +#include "../../../src/kernels/scatter_nd/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ScatterNDCnnl) { + // build routine + auto data = Tensor::share(DataType::F32, Shape{8}); + auto indices = Tensor::share(DataType::I64, Shape{4, 1}); + auto updates = Tensor::share(DataType::F32, Shape{4}); + auto output = Tensor::share(DataType::F32, Shape{8}); + ScatterNDInfo info(*data, *indices); + auto getRefs = [](std::vector> tensors) -> TensorRefs { + TensorRefs refs; + std::transform(tensors.begin(), tensors.end(), std::back_inserter(refs), + [](auto const &it) { return std::cref(*it); }); + return refs; + }; + auto kernel = ScatterNDCnnl::build(getRefs({data, indices, updates}), getRefs({output})), + kCpu = ScatterNDCpu::build(info); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluData = dev.malloc(data->bytesSize()), + mluIndices = 
dev.malloc(indices->bytesSize()), + mluUpdates = dev.malloc(updates->bytesSize()), + mluOut = dev.malloc(output->bytesSize()); + // put input data + std::vector data_(data->elementsSize()); + std::iota(data_.begin(), data_.end(), 1); + std::vector indices_{4, 3, 1, 7}; + std::vector updates_{9, 10, 11, 12}; + mluData->copyFromHost(data_.data(), data->bytesSize()); + mluIndices->copyFromHost(indices_.data(), indices->bytesSize()); + mluUpdates->copyFromHost(updates_.data(), updates->bytesSize()); + // inference + { + void const *inputs[]{*mluData, *mluIndices, *mluUpdates}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data_.data(), indices_.data(), updates_.data()}; + void *outputs[]{data_.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(output->elementsSize()); + mluOut->copyToHost(result.data(), output->bytesSize()); + EXPECT_EQ(result, data_); +} + +#endif diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp index 4ef7c6d23..7bd18bf6d 100644 --- a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp @@ -9,11 +9,13 @@ using namespace refactor; using namespace kernel; using namespace hardware; +template void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape dimC) { // Create Tensor and build kernels - auto aTensor = Tensor::share(DataType::F32, dimA, LayoutType::NCHW); - auto bTensor = Tensor::share(DataType::F32, dimB, LayoutType::NCHW); - auto cTensor = Tensor::share(DataType::F32, dimC, LayoutType::NCHW); + using T_ = primitive::type; + auto aTensor = Tensor::share(T, dimA, LayoutType::NCHW); + auto bTensor = Tensor::share(T, dimB, LayoutType::NCHW); + auto cTensor = Tensor::share(T, dimC, LayoutType::NCHW); auto kernel = BinaryCnnl::build(binaryOPT, *aTensor, *bTensor, *cTensor); auto kCpu = 
BinaryCpu::build(binaryOPT, *aTensor, *bTensor); ASSERT_TRUE(kCpu && kernel); @@ -21,10 +23,9 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di auto [routine, workspaceSize] = kernel->lower(res); auto rCpu = kCpu->lower(res).routine; // Init inputs and outputs - std::vector - a(aTensor->elementsSize(), 3.0f), - b(bTensor->elementsSize(), 2.0f), - c(cTensor->elementsSize()); + std::vector a(aTensor->elementsSize(), 3); + std::vector b(bTensor->elementsSize(), 2); + std::vector c(cTensor->elementsSize()); auto &dev = *device::init(Device::Type::Mlu, 0, ""); auto workspace = dev.malloc(workspaceSize), aMLU = dev.malloc(aTensor->bytesSize()), @@ -44,47 +45,85 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di rCpu(res, nullptr, inputs, outputs); } // Compare - std::vector result(cTensor->elementsSize()); + std::vector result(cTensor->elementsSize()); cMLU->copyToHost(result.data(), cTensor->bytesSize()); for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(c[i], result[i]); + EXPECT_EQ(c[i], result[i]); } } - TEST(kernel, BinaryCnnlAdd) { - testBinaryCnnl(SimpleBinaryType::Add, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Add, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlMul) { - testBinaryCnnl(SimpleBinaryType::Mul, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Mul, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlSub) { - testBinaryCnnl(SimpleBinaryType::Sub, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Sub, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlDiv) { - testBinaryCnnl(SimpleBinaryType::Div, Shape{10, 20, 30, 40}, 
Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Div, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } -// TEST(kernel, BinaryCnnlAnd) { -// testBinaryCnnl(SimpleBinaryType::And, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); -// } +TEST(kernel, BinaryCnnlPow) { + testBinaryCnnl(SimpleBinaryType::Pow, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} + +TEST(kernel, BinaryCnnlMod) { + testBinaryCnnl(SimpleBinaryType::Mod, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} -// TEST(kernel, BinaryCnnlOr) { -// testBinaryCnnl(SimpleBinaryType::Or, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); +TEST(kernel, BinaryCnnlFMod) { + testBinaryCnnl(SimpleBinaryType::Fmod, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} + +// TEST(kernel, BinaryCnnlMod) { +// testBinaryCnnl(SimpleBinaryType::Mod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); // } -// TEST(kernel, BinaryCnnlXor) { -// testBinaryCnnl(SimpleBinaryType::Xor, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); +// TEST(kernel, BinaryCnnlFmodI8) { +// testBinaryCnnl(SimpleBinaryType::Fmod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); // } -TEST(kernel, BinaryCnnlPow) { - testBinaryCnnl(SimpleBinaryType::Pow, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); -} +// TEST(kernel, BinaryCnnlFmodF32) { +// testBinaryCnnl(SimpleBinaryType::Fmod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); +// } TEST(kernel, BinaryCnnlBroadcast) { - testBinaryCnnl(SimpleBinaryType::Add, Shape{3, 4, 5, 6}, Shape{}, Shape{3, 4, 5, 6}); + testBinaryCnnl(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 
3, 4, 5, 6}); } + #endif diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index 2707e6274..e4b9428ee 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -51,6 +51,7 @@ TEST(kernel, SimpleUnaryCnnl) { testOp(SimpleUnaryType::Abs, false); testOp(SimpleUnaryType::Neg, false); testOp(SimpleUnaryType::Sqrt, false); + testOp(SimpleUnaryType::Erf, false); } TEST(kernel, ActivationCnnl) { From 5aa7a1e6d8edd320de298d99ad486d4a440e56ad Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Wed, 17 Jan 2024 08:23:03 +0000 Subject: [PATCH 5/9] =?UTF-8?q?feat:=20mlu=E8=B7=91=E9=80=9Allama=EF=BC=8C?= =?UTF-8?q?=E4=BD=86=E6=9C=AA=E5=BE=97=E5=88=B0=E6=AD=A3=E7=A1=AE=E7=BB=93?= =?UTF-8?q?=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/concat.cc | 4 ++- src/04kernel/src/collectors/gather.cc | 9 +++++-- src/04kernel/src/collectors/split.cc | 4 ++- .../src/kernels/concat/cnnl_kernel.cc | 2 +- .../src/kernels/concat/cnnl_kernel.hh | 2 +- .../src/kernels/gather/cnnl_kernel.cc | 5 ++-- .../src/kernels/gather/cnnl_kernel.hh | 2 +- .../src/kernels/reduce/cnnl_kernel.cc | 6 +++-- .../src/kernels/simple_binary/binary_cnnl.cc | 25 +++++-------------- .../simple_unary/cnnl_simple_unary_kernel.cc | 13 +++++----- src/04kernel/src/kernels/split/cnnl_kernel.cc | 11 +++++--- src/04kernel/src/kernels/split/cnnl_kernel.hh | 4 +-- src/09python_ffi/src/compiler.cc | 1 + src/09python_ffi/src/import.cpp | 1 + 14 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/04kernel/src/collectors/concat.cc b/src/04kernel/src/collectors/concat.cc index 27f27c776..0802de8e8 100644 --- a/src/04kernel/src/collectors/concat.cc +++ b/src/04kernel/src/collectors/concat.cc @@ -9,6 +9,8 @@ namespace refactor::kernel { ConcatCollector::filter(TensorRefs inputs, TensorRefs outputs) const 
{ SplitInfo info(axis, inputs); + auto const &b = outputs[0]; + std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -22,7 +24,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = ConcatCnnl::build(axis, inputs, outputs[0].get()); ptr) { + if (auto ptr = ConcatCnnl::build(axis, inputs, b); ptr) { ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/collectors/gather.cc b/src/04kernel/src/collectors/gather.cc index 3b66b91f0..0a8b8b0e8 100644 --- a/src/04kernel/src/collectors/gather.cc +++ b/src/04kernel/src/collectors/gather.cc @@ -9,7 +9,12 @@ namespace refactor::kernel { GatherCollector::filter(TensorRefs inputs, TensorRefs outputs) const { GatherInfo info(axis, inputs[0], inputs[1]); - std::vector ans; + auto const &a = inputs[0]; + auto const &b = inputs[1]; + auto const &c = outputs[0]; + + std::vector + ans; switch (_target) { case decltype(_target)::Cpu: if (auto ptr = GatherCpu::build(info); ptr != nullptr) { @@ -22,7 +27,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = GatherCnnl::build(axis, inputs[0].get(), inputs[1].get(), outputs[0].get()); ptr != nullptr) { + if (auto ptr = GatherCnnl::build(axis, a, b, c); ptr != nullptr) { ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/collectors/split.cc b/src/04kernel/src/collectors/split.cc index 6a3be0a65..b862d8959 100644 --- a/src/04kernel/src/collectors/split.cc +++ b/src/04kernel/src/collectors/split.cc @@ -9,6 +9,8 @@ namespace refactor::kernel { SplitCollector::filter(TensorRefs inputs, TensorRefs outputs) const { SplitInfo info(axis, outputs); + auto const &a = inputs[0]; + std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -22,7 +24,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = SplitCnnl::build(axis, inputs[0].get(), outputs); ptr) { + if (auto ptr = SplitCnnl::build(axis, a, outputs); ptr) { 
ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc index 7d11dbaa9..6c7a0fbaf 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -12,7 +12,7 @@ namespace refactor::kernel { K::ConcatCnnl(SplitInfoCnnl info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, TensorRefs inputs, Tensor output) noexcept -> KernelBox { + auto K::build(int axis, TensorRefs inputs, Tensor const &output) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.hh b/src/04kernel/src/kernels/concat/cnnl_kernel.hh index c49d5607a..0d4f6f853 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.hh @@ -11,7 +11,7 @@ namespace refactor::kernel { explicit ConcatCnnl(SplitInfoCnnl) noexcept; - static KernelBox build(int, TensorRefs, Tensor) noexcept; + static KernelBox build(int, TensorRefs, Tensor const &) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index 19f9ed358..d5ddcacec 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -11,13 +11,14 @@ namespace refactor::kernel { K::GatherCnnl(decltype(info) info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, Tensor input, Tensor index, Tensor output) noexcept -> KernelBox { + auto K::build(int axis, Tensor const &input, Tensor const &index, Tensor const &output) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif + return std::make_unique(decltype(info){ input.dataType, - index.dataType, + DataType::I32, axis, std::vector(input.shape.begin(), input.shape.end()), std::vector(index.shape.begin(), 
index.shape.end()), diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.hh b/src/04kernel/src/kernels/gather/cnnl_kernel.hh index f60a8ada0..154345929 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.hh @@ -15,7 +15,7 @@ namespace refactor::kernel { explicit GatherCnnl(decltype(info)) noexcept; - static KernelBox build(int, Tensor, Tensor, Tensor) noexcept; + static KernelBox build(int, Tensor const &, Tensor const &, Tensor const &) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc index 752bee690..6b22d793d 100644 --- a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -75,8 +75,10 @@ namespace refactor::kernel { for (auto axis : axes) { dimsO[axis] = 1; } - setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); - setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); + // setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); + // setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); // clang-format off auto reduceOp = reduceType == ReduceType::Mean ? 
CNNL_REDUCE_AVG diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 86e605349..11712b4f8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -26,9 +26,10 @@ namespace refactor::kernel { // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c - (a.shape != c.shape && b.shape != c.shape) || + (a.shape != c.shape && b.shape != c.shape) // Sub only supports brocasting b - (a.shape != c.shape && op == Op::Sub)) { + // (a.shape != c.shape && op == Op::Sub) + ) { return nullptr; } @@ -122,18 +123,13 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; size_t workspaceSize; - if (aDims != cDims) { - CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->bDesc, - d->aDesc, d->cDesc, - &workspaceSize)); - } else { - CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, d->bDesc, d->cDesc, &workspaceSize)); - } + res.fetchOrStore(); - auto routine = [swap = aDims != cDims, d, + auto routine = [d = std::move(d), workspaceSize, cnnlLogicOP, op = this->opType](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { auto handle = res.fetchOrStore()->handle; @@ -151,20 +147,11 @@ namespace refactor::kernel { beta = d->f32 ? 
factor(0) : factor(0); - - if (swap) { - CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, - &alphaB, d->bDesc, b, - &alphaA, d->aDesc, a, - workspace, workspaceSize, - &beta, d->cDesc, c)); - } else { CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, &alphaA, d->aDesc, a, &alphaB, d->bDesc, b, workspace, workspaceSize, &beta, d->cDesc, c)); - } } else if (op == SimpleBinaryType::Div) { CNNL_ASSERT(cnnlDiv_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc index 0c91a28d3..f8c0d7d01 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -62,12 +62,12 @@ namespace refactor::kernel { setCnnlTensor(d->tensor, dataType, slice(&size, 1)); - auto cnnlUnaryForward = [this](cnnlHandle_t handle, - const cnnlTensorDescriptor_t x_desc, - const void *x, - const cnnlTensorDescriptor_t y_desc, - void *y) -> cnnlStatus_t { - switch (this->type) { + auto cnnlUnaryForward = [t = this->type](cnnlHandle_t handle, + const cnnlTensorDescriptor_t x_desc, + const void *x, + const cnnlTensorDescriptor_t y_desc, + void *y) -> cnnlStatus_t { + switch (t) { case Ty::Abs: return cnnlAbs(handle, x_desc, x, y_desc, y); case Ty::Neg: @@ -77,6 +77,7 @@ namespace refactor::kernel { case Ty::Erf: return cnnlErf_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); default: + // fmt::println("{}", unaryName(t)); UNREACHABLE(); } }; diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc index ace809ffa..c967bff2d 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -14,7 +14,7 @@ namespace refactor::kernel { : dataType(dt_), axis(axis_), num(num_), inDim(std::move(in_)), outDims(std::move(out_)) {} - Info::SplitInfoCnnl(int axis, Tensor input, TensorRefs 
outputs) + Info::SplitInfoCnnl(int axis, Tensor const &input, TensorRefs outputs) : SplitInfoCnnl(input.dataType, axis, outputs.size(), std::move(std::vector(input.shape.begin(), input.shape.end())), std::move([](TensorRefs tensors) -> std::vector> { @@ -29,7 +29,7 @@ namespace refactor::kernel { K::SplitCnnl(SplitInfoCnnl info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, Tensor input, TensorRefs outputs) noexcept -> KernelBox { + auto K::build(int axis, Tensor const &input, TensorRefs outputs) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif @@ -78,9 +78,12 @@ namespace refactor::kernel { Descriptors(Descriptors &&) = delete; }; auto d = std::make_shared(info.num, info.dataType != DT::F64); - setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + for (auto i = 0; i < info.outDims.size(); i++) { - setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data())); } auto handle = res.fetchOrStore()->handle; diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.hh b/src/04kernel/src/kernels/split/cnnl_kernel.hh index bcde3d708..7fb4147d4 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/split/cnnl_kernel.hh @@ -14,7 +14,7 @@ namespace refactor::kernel { std::vector> outDims; SplitInfoCnnl(DataType, int, int, std::vector, std::vector>); - SplitInfoCnnl(int, Tensor, TensorRefs); + SplitInfoCnnl(int, Tensor const &, TensorRefs); }; struct SplitCnnl final : 
public Kernel { @@ -22,7 +22,7 @@ namespace refactor::kernel { explicit SplitCnnl(SplitInfoCnnl) noexcept; - static KernelBox build(int, Tensor, TensorRefs) noexcept; + static KernelBox build(int, Tensor const &, TensorRefs) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/09python_ffi/src/compiler.cc b/src/09python_ffi/src/compiler.cc index bf04053e9..45450582e 100644 --- a/src/09python_ffi/src/compiler.cc +++ b/src/09python_ffi/src/compiler.cc @@ -95,6 +95,7 @@ namespace refactor::python_ffi { // clang-format off auto target_ = target == "cpu" ? Target::Cpu : target == "cuda" ? Target::Nvidia + : target == "mlu" ? Target::Mlu : UNREACHABLEX(Target, "Unknown target: {}", target); // clang-format on return compileOn(hardware::device::fetch(target_), diff --git a/src/09python_ffi/src/import.cpp b/src/09python_ffi/src/import.cpp index dda0e660c..74cf7cf01 100644 --- a/src/09python_ffi/src/import.cpp +++ b/src/09python_ffi/src/import.cpp @@ -16,6 +16,7 @@ namespace refactor::python_ffi { // clang-format off auto type_ = type == "cpu" ? Device::Type::Cpu : type == "nvidia" ? Device::Type::Nvidia + : type == "mlu" ? 
Device::Type::Mlu : UNREACHABLEX(Device::Type, "Unknown device type: \"{}\"", type); // clang-format on return device::init(type_, card, ""); From 8bde8c1f30b8c0a1c71ccdbd12b7228bc09abd7d Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Fri, 19 Jan 2024 06:47:11 +0000 Subject: [PATCH 6/9] =?UTF-8?q?feat:=20mlu=E4=B8=8A=E8=B7=91=E9=80=9Allama?= =?UTF-8?q?/gpt2=EF=BC=8C=E7=BB=93=E6=9E=9C=E6=AD=A3=E7=A1=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/global_pool.cc | 6 ++ .../src/kernels/gather/cnnl_kernel.cc | 14 ++-- .../src/kernels/reduce/cnnl_kernel.cc | 11 +-- .../src/kernels/softmax/cnnl_kernel.cc | 9 +- src/04kernel/src/kernels/where/cnnl_kernel.cc | 82 +++++++++---------- src/04kernel/src/kernels/where/cnnl_kernel.hh | 4 +- .../src/utilities/bang/cnnl_context.cc | 9 ++ .../src/utilities/bang/cnnl_context.hh | 2 + .../src/utilities/bang/cnrt_functions.cc | 27 ++++++ .../src/utilities/bang/cnrt_functions.h | 16 ++++ .../test/kernels/gather/test_gather_cnnl.cpp | 45 ++++++++++ src/09python_ffi/CMakeLists.txt | 4 + src/09python_ffi/src/executor.cc | 13 ++- 13 files changed, 181 insertions(+), 61 deletions(-) create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.cc create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.h diff --git a/src/04kernel/src/collectors/global_pool.cc b/src/04kernel/src/collectors/global_pool.cc index 1ae1d7fc2..e6a278c1f 100644 --- a/src/04kernel/src/collectors/global_pool.cc +++ b/src/04kernel/src/collectors/global_pool.cc @@ -1,5 +1,6 @@ #include "kernel/collectors/global_pool.h" #include "../kernels/pool/cudnn_kernel.hh" +#include "../kernels/pool/cnnl_kernel.hh" namespace refactor::kernel { @@ -28,6 +29,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PoolCnnl::build(type, false, kernelShape, attributes, x, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + 
break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index d5ddcacec..b4d5aa154 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -4,6 +4,7 @@ #include "../../utilities/bang/cnnl_context.hh" #include "../../utilities/bang/cnnl_functions.h" #endif +#include namespace refactor::kernel { using K = GatherCnnl; @@ -15,11 +16,11 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - + return std::make_unique(decltype(info){ input.dataType, DataType::I32, - axis, + axis ? axis : 0, std::vector(input.shape.begin(), input.shape.end()), std::vector(index.shape.begin(), index.shape.end()), std::vector(output.shape.begin(), output.shape.end()), @@ -70,15 +71,16 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), - shape = info.inDim.data(), workspaceSize, + shape = std::vector(info.inDim.begin(), info.inDim.end()), + workspaceSize, dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + res.fetchOrStore()->copyFromCPU(workspace, shape.data(), workspaceSize); CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore()->handle, dim, d->inDesc, inputs[0], reinterpret_cast(workspace), - d->indexDesc, reinterpret_cast(inputs[1]), + d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); - }; + }; return {std::move(routine), workspaceSize}; } diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc index 6b22d793d..4ea6fd827 100644 --- a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -71,14 +71,15 @@ namespace refactor::kernel { std::vector 
dimsI(shape.begin(), shape.end()), - dimsO(shape.begin(), shape.end()); + dimsO(shape.begin(), shape.end()), + indices(axes.begin(), axes.end()); for (auto axis : axes) { dimsO[axis] = 1; } // setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); // setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); // clang-format off auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG @@ -91,12 +92,12 @@ namespace refactor::kernel { : UNREACHABLEX(cnnlReduceOp_t, ""); // clang-format on CNNL_ASSERT(cnnlSetReduceDescriptor_v2( - d->reduce, (int *) (axes.data()), axes.size(), reduceOp, + d->reduce, indices.data(), indices.size(), reduceOp, cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64), CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0)); auto handler = res.fetchOrStore()->handle; - size_t idxWorkspaceSize = axes.size() * sizeof(int); + size_t idxWorkspaceSize = indices.size() * sizeof(int); // idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256); size_t workspaceSize; // get workspace diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc index 865e452e1..0633195d5 100644 --- a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -59,9 +59,11 @@ namespace refactor::kernel { static_cast(algo), dataType != DataType::F64); int dims[]{pre, mid, post}; - cnnlSoftmaxMode_t mode = (post == 1) ? 
CNNL_SOFTMAX_MODE_HIGH_DIMENSION - : (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION - : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + // cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION + // : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION + // : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + // FIXME(bolun): CNNL Softmax mode + cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; // cnnlSoftmaxForward_v2 is applied to a 3D input tensor only CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims)); @@ -78,6 +80,7 @@ namespace refactor::kernel { CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &a, d->t, inputs[0], &b, d->t, outputs[0])); + res.fetchOrStore()->queueSync(); }; } diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc index 774c5513d..50b7c9d85 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -16,13 +16,24 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - return std::make_unique(decltype(info) { - inputs[1].get().dataType, - inputs[0].get().shape, - inputs[1].get().shape, - inputs[2].get().shape, - outputs[0].get().shape, - }); + std::vector cDim(inputs[0].get().shape.begin(), inputs[0].get().shape.end()), + xDim(inputs[1].get().shape.begin(), inputs[1].get().shape.end()), + yDim(inputs[2].get().shape.begin(), inputs[2].get().shape.end()), + ansDim(outputs[0].get().shape.begin(), outputs[0].get().shape.end()); + if (ansDim.size() == 0) { + ansDim.push_back(1); + } + if (xDim.size() == 0) { + xDim.push_back(1); + } + if (yDim.size() == 0) { + yDim.push_back(1); + } + if (cDim.size() == 0) { + cDim.push_back(1); + } + return std::make_unique(decltype(info){ + inputs[1].get().dataType, cDim, xDim, yDim, ansDim}); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1; @@ -44,11 +55,10 @@ namespace refactor::kernel { struct Descriptors { cnnlTensorDescriptor_t cond, x, 
y, ans; - bool f32; - explicit Descriptors(decltype(f32) f32_) + explicit Descriptors() : cond(nullptr), x(nullptr), y(nullptr), - ans(nullptr), f32(f32_) { + ans(nullptr) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); @@ -64,29 +74,20 @@ namespace refactor::kernel { Descriptors(const Descriptors &) = delete; Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(info.dataType != DT::F64); - - std::vector cDim(info.condDim.begin(), info.condDim.end()), - xDim(info.thenDim.begin(), info.thenDim.end()), - yDim(info.elseDim.begin(), info.elseDim.end()), - ansDim(info.outputDim.begin(), info.outputDim.end()); - - auto rightAlign = [](std::vector &dim, uint32_t targetLength) { - if (dim.size() < targetLength) { - dim.insert(dim.begin(), targetLength - dim.size(), 1); - } - }; - if (ansDim.size() == 0) { - ansDim.push_back(1); - } - rightAlign(cDim, ansDim.size()); - rightAlign(xDim, ansDim.size()); - rightAlign(yDim, ansDim.size()); - - CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data())); + auto d = std::make_shared(); + + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->cond, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(DT::Bool), + info.condDim.size(), info.condDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.thenDim.size(), info.thenDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + 
info.elseDim.size(), info.elseDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->ans, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outputDim.size(), info.outputDim.data())); auto handle = res.fetchOrStore()->handle; size_t workspaceSize; @@ -94,19 +95,14 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - // fetch cnnl handle from resources - auto handle = res.fetchOrStore()->handle; - auto cond = inputs[0], - x = inputs[1], - y = inputs[2]; - auto ans = outputs[0]; CNNL_ASSERT(cnnlSelectV2( - handle, d->cond, cond, d->x, x, - d->y, y, workspace, workspaceSize, - d->ans, ans)); + res.fetchOrStore()->handle, + d->cond, inputs[0], d->x, inputs[1], + d->y, inputs[2], workspace, workspaceSize, + d->ans, outputs[0])); - cnrtQueueSync(res.fetchOrStore()->queue); + res.fetchOrStore()->queueSync(); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.hh b/src/04kernel/src/kernels/where/cnnl_kernel.hh index 6139b1832..ffe39a875 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/where/cnnl_kernel.hh @@ -7,12 +7,10 @@ namespace refactor::kernel { - using Shape = absl::InlinedVector; - struct WhereCnnl final : public Kernel { struct { DataType dataType; - Shape condDim, thenDim, elseDim, outputDim; + std::vector condDim, thenDim, elseDim, outputDim; } info; WhereCnnl(decltype(info)) noexcept; diff --git a/src/04kernel/src/utilities/bang/cnnl_context.cc b/src/04kernel/src/utilities/bang/cnnl_context.cc index 15cc13829..f2ad33ab5 100644 --- a/src/04kernel/src/utilities/bang/cnnl_context.cc +++ b/src/04kernel/src/utilities/bang/cnnl_context.cc @@ -30,6 +30,15 @@ namespace refactor::kernel::cnnl { return "CnnlContext"; } + void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) { + BANG_ASSERT(cnrtMemcpy(dst, 
const_cast(src), size, + CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + + void CnnlContext::queueSync() { + BANG_ASSERT(cnrtQueueSync(queue)); + } + }// namespace refactor::kernel::cnnl #endif diff --git a/src/04kernel/src/utilities/bang/cnnl_context.hh b/src/04kernel/src/utilities/bang/cnnl_context.hh index 7db40d3d3..4743a0e4e 100644 --- a/src/04kernel/src/utilities/bang/cnnl_context.hh +++ b/src/04kernel/src/utilities/bang/cnnl_context.hh @@ -22,6 +22,8 @@ namespace refactor::kernel::cnnl { size_t resourceTypeId() const noexcept final; std::string_view description() const noexcept final; + void copyFromCPU(void *dst, const void *src, size_t size); + void queueSync(); }; }// namespace refactor::kernel::cnnl diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc new file mode 100644 index 000000000..2ea66194d --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -0,0 +1,27 @@ +#ifdef USE_BANG +#include "cnrt_functions.h" +#include "cnnl_functions.h" +#include +#include + +namespace refactor::kernel::cnnl { + + int currentDevice() { + int device; + BANG_ASSERT(cnrtGetDevice(&device)); + return device; + } + + void sync() { + BANG_ASSERT(cnrtSyncDevice()); + } + + void copyOut(void *dst, const void *src, size_t size) { + sync(); + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), size, + CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h new file mode 100644 index 000000000..ef119819c --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -0,0 +1,16 @@ +#ifndef KERNEL_CNRT_FUNCTIONS_H +#define KERNEL_CNRT_FUNCTIONS_H + +#include "common.h" + +namespace refactor::kernel::cnnl { + + int currentDevice(); + + void sync(); + + void copyOut(void *dst, const void *src, size_t size); + +}// namespace refactor::kernel::cnnl + +#endif// 
KERNEL_CNRT_FUNCTIONS_H diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp index a3f739cd0..020b5f914 100644 --- a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -94,6 +94,51 @@ TEST(kernel, GatherCnnl) { EXPECT_FLOAT_EQ(c[i], result[i]); } } + + // Case axis = 1, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{32, 16}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I64, Shape{1, 4}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{1, 4, 16}, LayoutType::NCHW); + GatherInfo info(0, *data, *indices); + auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a; + for (auto i = 0; i < data->elementsSize(); i++) { + a.push_back(i + 0.1f); + } + std::vector b(indices->elementsSize(), 0); + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } } #endif diff 
--git a/src/09python_ffi/CMakeLists.txt b/src/09python_ffi/CMakeLists.txt index ccce34d37..09567c9da 100644 --- a/src/09python_ffi/CMakeLists.txt +++ b/src/09python_ffi/CMakeLists.txt @@ -10,6 +10,10 @@ pybind11_add_module(python_ffi SHARED ${PYFFI_SRC}) target_link_libraries(python_ffi PRIVATE onnx llm communication) target_include_directories(python_ffi PRIVATE include) +if(USE_BANG) + target_include_directories(python_ffi PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../04kernel/src/utilities/bang) +endif() + # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a # define (VERSION_INFO) here. # target_compile_definitions(python_ffi diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc index c6a20cb95..947410ccb 100644 --- a/src/09python_ffi/src/executor.cc +++ b/src/09python_ffi/src/executor.cc @@ -7,6 +7,10 @@ #include "kernel/cuda/functions.cuh" #endif// USE_CUDA +#ifdef USE_BANG +#include "cnrt_functions.h" +#endif// USE_BANG + namespace refactor::python_ffi { Executor::Executor(computation::Graph graph, runtime::Stream stream) @@ -70,9 +74,13 @@ namespace refactor::python_ffi { void Executor::bench(bool sync) { #ifdef USE_CUDA auto ans = _stream.bench(sync ? kernel::cuda::sync : nullptr); +#else + #ifdef USE_BANG + auto ans = _stream.bench(sync ? 
kernel::cnnl::sync : nullptr); #else auto ans = _stream.bench(nullptr); -#endif// USE_CUDA + #endif +#endif auto const &nodes = _graph.internal().contiguous().nodes; for (auto i : range0_(nodes.size())) { fmt::println("{} {} {}", @@ -213,6 +221,9 @@ namespace refactor::python_ffi { #ifdef USE_CUDA kernel::cuda::copyOut(buffer.data(), addresses[idx], size); #endif +#ifdef USE_BANG + kernel::cnnl::copyOut(buffer.data(), addresses[idx], size); +#endif auto file = path / fmt::format("data{:06}.{}", dataIdx++, format); fs::remove(file); From 2258c1ee2d09cfe0add168d5944e6d3ec349c634 Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Fri, 26 Jan 2024 15:13:09 +0800 Subject: [PATCH 7/9] =?UTF-8?q?fix:=20=E8=A7=A3=E5=86=B3=E4=B8=80=E4=BA=9B?= =?UTF-8?q?warning=EF=BC=8C=E5=B9=B6=E6=8A=8Async=E6=93=8D=E4=BD=9C?= =?UTF-8?q?=E4=BB=8E=E7=AE=97=E5=AD=90=E5=86=85=E9=83=A8=E7=A7=BB=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../batch_normalization/cnnl_kernel.cc | 5 ++-- src/04kernel/src/kernels/cast/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/clip/cnnl_kernel.cc | 1 - .../src/kernels/concat/cnnl_kernel.cc | 4 ++-- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 6 ++--- .../src/kernels/expand/cnnl_kernel.cc | 1 - .../src/kernels/gather/cnnl_kernel.cc | 1 - .../src/kernels/mat_mul/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/pool/cnnl_kernel.cc | 4 ++-- .../src/kernels/simple_binary/binary_cnnl.cc | 1 - src/04kernel/src/kernels/slice/cnnl_kernel.cc | 2 +- .../src/kernels/softmax/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/split/cnnl_kernel.cc | 4 ++-- src/04kernel/src/kernels/where/cnnl_kernel.cc | 1 - .../src/utilities/bang/cnrt_functions.cc | 4 ++-- .../src/utilities/bang/cnrt_functions.h | 4 ++-- .../kernels/batch_normalization/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/cast/test_cnnl.cpp | 4 +++- src/04kernel/test/kernels/clip/test_cnnl.cpp | 4 +++- .../test/kernels/concat/test_cnnl.cpp | 4 +++- 
src/04kernel/test/kernels/conv/test_cnnl.cpp | 11 ++------- .../test/kernels/expand/test_cnnl.cpp | 2 ++ .../test/kernels/gather/test_gather_cnnl.cpp | 6 ++++- .../test/kernels/mat_mul/test_cnnl.cpp | 5 ++++ src/04kernel/test/kernels/pool/test_cnnl.cpp | 2 ++ .../test/kernels/reduce/test_cnnl.cpp | 2 ++ .../test/kernels/scatter_nd/test_cnnl.cpp | 4 +++- .../simple_binary/test_binary_cnnl.cpp | 23 ++----------------- .../test/kernels/simple_unary/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/slice/test_cnnl.cpp | 4 +++- .../test/kernels/softmax/test_cnnl.cpp | 4 +++- src/04kernel/test/kernels/split/test_cnnl.cpp | 4 +++- .../test/kernels/transpose/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/where/test_cnnl.cpp | 2 ++ src/09python_ffi/src/executor.cc | 4 ++-- 35 files changed, 67 insertions(+), 65 deletions(-) diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc index e3efcd1d3..1330cfa7a 100644 --- a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -128,8 +128,8 @@ namespace refactor::kernel { auto y = outputs[0]; void *xTrans = workspace; - void *yTrans = xTrans + xTransSize; - void *cursor = yTrans + xTransSize; + void *yTrans = reinterpret_cast(xTrans) + xTransSize; + void *cursor = reinterpret_cast(yTrans) + xTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, @@ -147,7 +147,6 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans, d->inDesc, y, cursor, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), totalWorkspaceSize}; diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc index 951209741..54798e670 100644 --- a/src/04kernel/src/kernels/cast/cnnl_kernel.cc +++ 
b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -65,7 +65,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.cc b/src/04kernel/src/kernels/clip/cnnl_kernel.cc index 423939bc9..fe65e99b3 100644 --- a/src/04kernel/src/kernels/clip/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.cc @@ -57,7 +57,6 @@ namespace refactor::kernel { CNNL_POINTER_MODE_DEVICE, d->t, inputs[0], inputs[1], hasMax ? inputs[2] : nullptr, d->t, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc index 6c7a0fbaf..c35b1c33f 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -52,7 +52,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -62,7 +62,7 @@ namespace refactor::kernel { }; auto d = std::make_shared(info.num, info.dataType != DT::F64); setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); } diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc index 85dcb9517..187ac4b51 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ 
-209,9 +209,9 @@ namespace refactor::kernel { // } void *xTrans = workspace; - void *wTrans = xTrans + xTransSize; - void *yTrans = wTrans + wTransSize; - void *opWorkspace = yTrans + yTransSize; + void *wTrans = reinterpret_cast(xTrans) + xTransSize; + void *yTrans = reinterpret_cast(wTrans) + wTransSize; + void *opWorkspace = reinterpret_cast(yTrans) + yTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x, diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.cc b/src/04kernel/src/kernels/expand/cnnl_kernel.cc index 18f586422..426eac49d 100644 --- a/src/04kernel/src/kernels/expand/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.cc @@ -60,7 +60,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlExpand(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } #endif diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index b4d5aa154..abdf0d5c4 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -79,7 +79,6 @@ namespace refactor::kernel { d->inDesc, inputs[0], reinterpret_cast(workspace), d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc index 7be05b7b2..3eac35723 100644 --- a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc @@ -141,7 +141,6 @@ namespace refactor::kernel { workspace, algoWorkspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), 
algoWorkspaceSize}; diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.cc b/src/04kernel/src/kernels/pool/cnnl_kernel.cc index 083125b1f..929ea5789 100644 --- a/src/04kernel/src/kernels/pool/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.cc @@ -130,7 +130,7 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; void *extraInputDev = workspace; - void *poolWorkSpace = workspace + extraInputSize; + void *poolWorkSpace = reinterpret_cast(workspace) + extraInputSize; void *extraInputHost = malloc(extraInputSize); CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost)); @@ -145,7 +145,7 @@ namespace refactor::kernel { &b, extraInputDev, d->y, outputs[0], poolWorkSpace, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + res.fetchOrStore()->queueSync(); free(extraInputHost); }; diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 11712b4f8..8b4647ac8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -180,7 +180,6 @@ namespace refactor::kernel { workspace, workspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.cc b/src/04kernel/src/kernels/slice/cnnl_kernel.cc index b5f8b4c85..85bc90938 100644 --- a/src/04kernel/src/kernels/slice/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.cc @@ -64,7 +64,7 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data())); std::vector begin, end, stride; - for (auto i = 0; i < info.dims.size(); i++) { + for (size_t 
i = 0; i < info.dims.size(); i++) { // [begin, end), end is not inclued begin.push_back(info.dims[i].start); auto sign = info.dims[i].step > 0 ? 1 : -1; diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc index 0633195d5..babaf33cc 100644 --- a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -80,7 +80,6 @@ namespace refactor::kernel { CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &a, d->t, inputs[0], &b, d->t, outputs[0])); - res.fetchOrStore()->queueSync(); }; } diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc index c967bff2d..8f686d597 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -69,7 +69,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -81,7 +81,7 @@ namespace refactor::kernel { // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data())); } diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc index 50b7c9d85..14f8d6676 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -102,7 +102,6 @@ namespace 
refactor::kernel { d->y, inputs[2], workspace, workspaceSize, d->ans, outputs[0])); - res.fetchOrStore()->queueSync(); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc index 2ea66194d..26c1b975d 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.cc +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -4,7 +4,7 @@ #include #include -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice() { int device; @@ -22,6 +22,6 @@ namespace refactor::kernel::cnnl { CNRT_MEM_TRANS_DIR_DEV2HOST)); } -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h index ef119819c..3a05195ce 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.h +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -3,7 +3,7 @@ #include "common.h" -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice(); @@ -11,6 +11,6 @@ namespace refactor::kernel::cnnl { void copyOut(void *dst, const void *src, size_t size); -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif// KERNEL_CNRT_FUNCTIONS_H diff --git a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp index 14a1a07f4..d1ad9bd78 100644 --- a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp +++ b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/batch_normalization/cnnl_kernel.hh" #include "../../../src/kernels/batch_normalization/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -57,6 +58,7 @@ TEST(kernel, BatchNormalizationCnnl) { void const *inputs[]{*mluIn, *mluScale, *mluBias, 
*mluMean, *mluVar}; void *outputs[]{*mluOut}; rMlu(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/cast/test_cnnl.cpp b/src/04kernel/test/kernels/cast/test_cnnl.cpp index 72f842470..94297357d 100644 --- a/src/04kernel/test/kernels/cast/test_cnnl.cpp +++ b/src/04kernel/test/kernels/cast/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/cast/cpu_kernel.hh" #include "../../../src/kernels/cast/cnnl_kernel.hh" +#include "../../../src/kernels/cast/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -34,6 +35,7 @@ TEST(kernel, CastCnnl) { void const *inputs[]{*xMlu}; void *outputs[]{*yMlu}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{x_.data()}; diff --git a/src/04kernel/test/kernels/clip/test_cnnl.cpp b/src/04kernel/test/kernels/clip/test_cnnl.cpp index ab29a1cb5..ff2e77290 100644 --- a/src/04kernel/test/kernels/clip/test_cnnl.cpp +++ b/src/04kernel/test/kernels/clip/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/clip/cpu_kernel.hh" #include "../../../src/kernels/clip/cnnl_kernel.hh" +#include "../../../src/kernels/clip/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ClipCnnl) { void const *inputs[]{*mluMem, *mluMin, *mluMax}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{value.data(), &min, &max}; diff --git a/src/04kernel/test/kernels/concat/test_cnnl.cpp b/src/04kernel/test/kernels/concat/test_cnnl.cpp index 2911c9e90..ecc817aca 100644 --- a/src/04kernel/test/kernels/concat/test_cnnl.cpp +++ b/src/04kernel/test/kernels/concat/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include 
"../../../src/kernels/concat/cpu_kernel.hh" #include "../../../src/kernels/concat/cnnl_kernel.hh" +#include "../../../src/kernels/concat/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -65,6 +66,7 @@ TEST(kernel, ConcatCnnl) { void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; diff --git a/src/04kernel/test/kernels/conv/test_cnnl.cpp b/src/04kernel/test/kernels/conv/test_cnnl.cpp index bf4bff3c8..74e799f5a 100644 --- a/src/04kernel/test/kernels/conv/test_cnnl.cpp +++ b/src/04kernel/test/kernels/conv/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/conv/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -39,15 +40,7 @@ void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const i void const *inputs[]{*xMlu, *wMlu}; void *outputs[]{*yMlu}; routine(res, *workspace, inputs, outputs); - - xMlu->copyToHost(xData.data(), xTensor->bytesSize()); - wMlu->copyToHost(wData.data(), wTensor->bytesSize()); - // fmt::println("{}", vec2str(xData)); - // fmt::println("{}", vec2str(wData)); - - // std::vector ws(workspaceSize); - // workspace->copyToHost(ws.data(), workspaceSize); - // fmt::println("{}", vec2str(ws)); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/expand/test_cnnl.cpp b/src/04kernel/test/kernels/expand/test_cnnl.cpp index e06e59777..43fb07e8d 100644 --- a/src/04kernel/test/kernels/expand/test_cnnl.cpp +++ b/src/04kernel/test/kernels/expand/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/expand/cnnl_kernel.hh" #include "../../../src/kernels/expand/cpu_kernel.hh" +#include 
"../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ExpandCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp index 020b5f914..b63682d26 100644 --- a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/gather/cnnl_kernel.hh" #include "../src/kernels/gather/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -81,6 +83,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -110,7 +113,7 @@ TEST(kernel, GatherCnnl) { auto cpuRoutine = cpuKernel->lower(res).routine; // Init inputs and outputs std::vector a; - for (auto i = 0; i < data->elementsSize(); i++) { + for (size_t i = 0; i < data->elementsSize(); i++) { a.push_back(i + 0.1f); } std::vector b(indices->elementsSize(), 0); @@ -126,6 +129,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; diff --git a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp index 251c5c898..f079b4445 100644 --- a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp +++ 
b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/mat_mul/cnnl_kernel.hh" #include "../src/kernels/mat_mul/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -48,6 +49,7 @@ TEST(kernel, MatMulCnnl_OnlyBias) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); @@ -91,6 +93,7 @@ TEST(kernel, MatMulCnnl_Broadcast) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; @@ -135,6 +138,7 @@ TEST(kernel, MatMulCnnl_TransABNoBias) { void const *inputs[]{*ma, *mb}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data()}; @@ -189,6 +193,7 @@ TEST(kernel, MatMulCnnl_Large) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; diff --git a/src/04kernel/test/kernels/pool/test_cnnl.cpp b/src/04kernel/test/kernels/pool/test_cnnl.cpp index 405bf3f8c..1adf45139 100644 --- a/src/04kernel/test/kernels/pool/test_cnnl.cpp +++ b/src/04kernel/test/kernels/pool/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/pool/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ void testPoolCnnl(PoolType poolType, int rank, const int64_t *pads, const int64_ void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); 
mluMem->copyToHost(result.data(), yTensor->bytesSize()); diff --git a/src/04kernel/test/kernels/reduce/test_cnnl.cpp b/src/04kernel/test/kernels/reduce/test_cnnl.cpp index 32952fead..113fe7664 100644 --- a/src/04kernel/test/kernels/reduce/test_cnnl.cpp +++ b/src/04kernel/test/kernels/reduce/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/reduce/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -28,6 +29,7 @@ static void testReducemean(const Shape &shape, const std::vector &data, void const *inputs[]{*mluMemIn}; void *outputs[]{*mluMemOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data Shape outDimArray; diff --git a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp index 1cdf72166..cf5b9c367 100644 --- a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp +++ b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" #include "../../../src/kernels/scatter_nd/cnnl_kernel.hh" +#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -48,6 +49,7 @@ TEST(kernel, ScatterNDCnnl) { void const *inputs[]{*mluData, *mluIndices, *mluUpdates}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data_.data(), indices_.data(), updates_.data()}; diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp index 7bd18bf6d..dfd3c9b80 100644 --- a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/simple_binary/binary_cnnl.hh" #include 
"../src/kernels/simple_binary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -38,6 +39,7 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -100,27 +102,6 @@ TEST(kernel, BinaryCnnlFMod) { Shape{2, 5, 10, 20, 3, 4}); } -// TEST(kernel, BinaryCnnlMod) { -// testBinaryCnnl(SimpleBinaryType::Mod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodI8) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodF32) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - TEST(kernel, BinaryCnnlBroadcast) { testBinaryCnnl(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 3, 4, 5, 6}); } diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index e4b9428ee..8e1e8a9f6 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -3,6 +3,7 @@ #include "../../../src/kernels/simple_unary/cnnl_activation_kernel.hh" #include "../../../src/kernels/simple_unary/cnnl_simple_unary_kernel.hh" #include "../../../src/kernels/simple_unary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ static void testOp(SimpleUnaryType opType, bool activation = true) { void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const 
*inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/slice/test_cnnl.cpp b/src/04kernel/test/kernels/slice/test_cnnl.cpp index 914a8fee4..1685d7aaa 100644 --- a/src/04kernel/test/kernels/slice/test_cnnl.cpp +++ b/src/04kernel/test/kernels/slice/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/slice/cpu_kernel.hh" #include "../../../src/kernels/slice/cnnl_kernel.hh" +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -45,6 +46,7 @@ TEST(kernel, SliceCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp index a8c7fb283..09874d207 100644 --- a/src/04kernel/test/kernels/softmax/test_cnnl.cpp +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/softmax/cpu_kernel.hh" #include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "../../../src/kernels/softmax/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, SoftmaxCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; rCnnl(res, nullptr, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/split/test_cnnl.cpp b/src/04kernel/test/kernels/split/test_cnnl.cpp index 5afa434e6..71e69b219 100644 --- a/src/04kernel/test/kernels/split/test_cnnl.cpp +++ b/src/04kernel/test/kernels/split/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/split/cpu_kernel.hh" #include "../../../src/kernels/split/cnnl_kernel.hh" +#include "../../../src/kernels/split/cpu_kernel.hh" +#include 
"../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -64,6 +65,7 @@ TEST(kernel, SplitCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOuts[0], *mluOuts[1], *mluOuts[2], *mluOuts[3]}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp index 4f4301d86..f46890714 100644 --- a/src/04kernel/test/kernels/transpose/test_cnnl.cpp +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/transpose/cnnl_kernel.hh" #include "../../../src/kernels/transpose/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -42,6 +43,7 @@ TEST(kernel, TransposeCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(dataTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/where/test_cnnl.cpp b/src/04kernel/test/kernels/where/test_cnnl.cpp index aaad65857..6e26ddbd2 100644 --- a/src/04kernel/test/kernels/where/test_cnnl.cpp +++ b/src/04kernel/test/kernels/where/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/where/cnnl_kernel.hh" #include "../../../src/kernels/where/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -49,6 +50,7 @@ void testWhereCnnl(Shape cDim, Shape xDim, Shape yDim, Shape outDim) { void const *inputs[]{*mluC, *mluX, *mluY}; void *outputs[]{*mluOut}; rCnnl(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc index 947410ccb..1d0e543f1 100644 --- a/src/09python_ffi/src/executor.cc +++ 
b/src/09python_ffi/src/executor.cc @@ -76,7 +76,7 @@ namespace refactor::python_ffi { auto ans = _stream.bench(sync ? kernel::cuda::sync : nullptr); #else #ifdef USE_BANG - auto ans = _stream.bench(sync ? kernel::cnnl::sync : nullptr); + auto ans = _stream.bench(sync ? kernel::bang::sync : nullptr); #else auto ans = _stream.bench(nullptr); #endif @@ -222,7 +222,7 @@ namespace refactor::python_ffi { kernel::cuda::copyOut(buffer.data(), addresses[idx], size); #endif #ifdef USE_BANG - kernel::cnnl::copyOut(buffer.data(), addresses[idx], size); + kernel::bang::copyOut(buffer.data(), addresses[idx], size); #endif auto file = path / fmt::format("data{:06}.{}", dataIdx++, format); From b82c861c3cf08a9eb72e93f4c77ac72f237c5bb3 Mon Sep 17 00:00:00 2001 From: Bolun Date: Wed, 31 Jan 2024 07:21:02 +0000 Subject: [PATCH 8/9] =?UTF-8?q?fix:=20=E6=95=B4=E7=90=86=E5=88=86=E6=94=AF?= =?UTF-8?q?=E5=90=8E=E8=B7=91=E9=80=9A=EF=BC=8C=E6=B7=BB=E5=8A=A0HardSwish?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/kernels/gather/cnnl_kernel.cc | 18 +++++++++------ .../src/kernels/simple_binary/binary_cnnl.cc | 22 ++++++++----------- .../simple_unary/cnnl_activation_kernel.cc | 21 ++++++++++++------ .../test/kernels/simple_unary/test_cnnl.cpp | 1 + .../test/kernels/transpose/test_cnnl.cpp | 4 ++-- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index abdf0d5c4..58b86660b 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -16,13 +16,16 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - + auto indicesDim = std::vector(index.shape.begin(), index.shape.end()); + if (indicesDim.size() == 0) { + indicesDim.push_back(1); + } return std::make_unique(decltype(info){ input.dataType, - DataType::I32, - axis ? 
axis : 0, + index.dataType, + axis, std::vector(input.shape.begin(), input.shape.end()), - std::vector(index.shape.begin(), index.shape.end()), + std::move(indicesDim), std::vector(output.shape.begin(), output.shape.end()), }); } @@ -60,8 +63,9 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTensorDescriptor( d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + // cnnlGatherV2 does not support int64 indices CNNL_ASSERT(cnnlSetTensorDescriptor( - d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + d->indexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32, info.indexDim.size(), info.indexDim.data())); CNNL_ASSERT(cnnlSetTensorDescriptor( d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), @@ -71,7 +75,7 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), - shape = std::vector(info.inDim.begin(), info.inDim.end()), + shape = std::vector(info.inDim.begin(), info.inDim.end()), workspaceSize, dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { res.fetchOrStore()->copyFromCPU(workspace, shape.data(), workspaceSize); @@ -79,7 +83,7 @@ namespace refactor::kernel { d->inDesc, inputs[0], reinterpret_cast(workspace), d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); - }; + }; return {std::move(routine), workspaceSize}; } diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 8b4647ac8..d62ba0ab8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -26,10 +26,7 @@ namespace refactor::kernel { // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c - (a.shape != c.shape && b.shape != c.shape) - // Sub only supports brocasting b - // (a.shape != c.shape && op == Op::Sub) - ) { + (a.shape != 
c.shape && b.shape != c.shape)) { return nullptr; } @@ -124,9 +121,9 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; size_t workspaceSize; CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, - d->bDesc, d->cDesc, - &workspaceSize)); - + d->bDesc, d->cDesc, + &workspaceSize)); + res.fetchOrStore(); auto routine = [d = std::move(d), @@ -147,11 +144,11 @@ namespace refactor::kernel { beta = d->f32 ? factor(0) : factor(0); - CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, - &alphaA, d->aDesc, a, - &alphaB, d->bDesc, b, - workspace, workspaceSize, - &beta, d->cDesc, c)); + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaA, d->aDesc, a, + &alphaB, d->bDesc, b, + workspace, workspaceSize, + &beta, d->cDesc, c)); } else if (op == SimpleBinaryType::Div) { CNNL_ASSERT(cnnlDiv_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, @@ -179,7 +176,6 @@ namespace refactor::kernel { d->cDesc, c, workspace, workspaceSize)); } - }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc index d35535948..68670662c 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc @@ -17,7 +17,7 @@ namespace refactor::kernel { : Kernel(), type(type_), dataType(dataType_), size(size_) {} auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { - static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh}; + static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh, Op::HardSwish}; #ifndef USE_BANG return nullptr; @@ -64,20 +64,27 @@ namespace refactor::kernel { auto d = std::make_shared(); // clang-format off - auto mode = type == Ty::Relu ? CNNL_ACTIVATION_RELU - : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID - : type == Ty::Tanh ? CNNL_ACTIVATION_TANH + auto mode = type == Ty::Relu ? 
CNNL_ACTIVATION_RELU + : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID + : type == Ty::Tanh ? CNNL_ACTIVATION_TANH + : type == Ty::HardSwish ? CNNL_ACTIVATION_HARDSWISH : UNREACHABLEX(cnnlActivationMode_t, ""); + float coef = 0.0; + float slicedDim = 0.0; + float gamma = 0.0; + float scale = 0.0; // clang-format on setCnnlTensor(d->tensor, dataType, slice(&size, 1)); - CNNL_ASSERT(cnnlSetActivationDescriptor_v2(d->activation, mode, CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, 0.0)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, mode, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, coef, + slicedDim, gamma, scale, true)); res.fetchOrStore(); return [d = std::move(d)]// (Resources & res, void *, void const *const *inputs, void *const *outputs) { - float alpha = 1, beta = 0; + float alpha = 1.f, beta = 0.f; CNNL_ASSERT(cnnlActivationForward( res.fetchOrStore()->handle, d->activation, diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index 8e1e8a9f6..3964989c9 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -60,6 +60,7 @@ TEST(kernel, ActivationCnnl) { testOp(SimpleUnaryType::Relu); testOp(SimpleUnaryType::Sigmoid); testOp(SimpleUnaryType::Tanh); + testOp(SimpleUnaryType::HardSwish); } diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp index f46890714..9acdd95ba 100644 --- a/src/04kernel/test/kernels/transpose/test_cnnl.cpp +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -14,8 +14,8 @@ using namespace hardware; TEST(kernel, TransposeCnnl) { // build routine auto dataTensor = Tensor::share(DataType::F32, Shape{1, 3, 2, 5}); - auto info = TransposeInfo(dataTensor->shape, Permutation{2, 3, 0, 1}); - auto kCpu = TransposeCpu::build(dataTensor->dataType, info); + auto info = TransposeInfo(dataTensor->dataType, 
dataTensor->shape, Permutation{2, 3, 0, 1}); + auto kCpu = TransposeCpu::build(info); auto kernel = TransposeCnnl::build(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1}); ASSERT_TRUE(kCpu && kernel); auto res = runtime::Resources(); From 626ae8fe5aece306ea5a7c2f0a5d54d3ad22c92e Mon Sep 17 00:00:00 2001 From: Bolun Date: Wed, 31 Jan 2024 08:14:02 +0000 Subject: [PATCH 9/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0hardsigmoid/pad/m?= =?UTF-8?q?in/max=E7=AE=97=E5=AD=90=EF=BC=8C=E4=BF=AE=E5=A4=8Dconv/transpo?= =?UTF-8?q?se=E7=AD=89=E5=87=BA=E7=8E=B0=E7=9A=84=E5=B0=8F=E9=94=99?= =?UTF-8?q?=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/hard_sigmoid.cc | 6 + src/04kernel/src/collectors/pad.cc | 7 +- src/04kernel/src/collectors/select.cc | 4 + src/04kernel/src/kernels/cast/cnnl_kernel.cc | 19 ++- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 67 +++----- src/04kernel/src/kernels/conv/cnnl_kernel.hh | 2 +- .../src/kernels/hard_sigmoid/cnnl_kernel.cc | 81 ++++++++++ .../src/kernels/hard_sigmoid/cnnl_kernel.hh | 27 ++++ src/04kernel/src/kernels/pad/cnnl_kernel.cc | 95 +++++++++++ src/04kernel/src/kernels/pad/cnnl_kernel.hh | 28 ++++ .../src/kernels/scatter_nd/cnnl_kernel.cc | 1 - .../src/kernels/select/cnnl_kernel.cc | 151 ++++++++++++++++++ .../src/kernels/select/cnnl_kernel.hh | 33 ++++ .../src/kernels/transpose/cnnl_kernel.cc | 8 +- .../test/kernels/hard_sigmoid/test_cnnl.cpp | 51 ++++++ src/04kernel/test/kernels/pad/test_cnnl.cpp | 131 +++++++++++++++ .../test/kernels/select/test_cnnl.cpp | 99 ++++++++++++ 17 files changed, 757 insertions(+), 53 deletions(-) create mode 100644 src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/pad/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/pad/cnnl_kernel.hh create mode 100644 
src/04kernel/src/kernels/select/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/select/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/pad/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/select/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/hard_sigmoid.cc b/src/04kernel/src/collectors/hard_sigmoid.cc index 69d2f9d1e..c44151c20 100644 --- a/src/04kernel/src/collectors/hard_sigmoid.cc +++ b/src/04kernel/src/collectors/hard_sigmoid.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/hard_sigmoid.h" +#include "../kernels/hard_sigmoid/cnnl_kernel.hh" #include "../kernels/hard_sigmoid/cpu_kernel.hh" #include "../kernels/hard_sigmoid/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = HardSigmoidCnnl::build(alpha, beta, a); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/pad.cc b/src/04kernel/src/collectors/pad.cc index f4c995e0b..1c7be68c4 100644 --- a/src/04kernel/src/collectors/pad.cc +++ b/src/04kernel/src/collectors/pad.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/pad.h" +#include "../kernels/pad/cnnl_kernel.hh" #include "../kernels/pad/cpu_kernel.hh" #include "../kernels/pad/cuda_kernel.hh" @@ -22,6 +23,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PadCnnl::build(dims, input.get().dataType, mode, const_value); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } @@ -29,4 +35,3 @@ namespace refactor::kernel { } }// namespace refactor::kernel - diff --git a/src/04kernel/src/collectors/select.cc b/src/04kernel/src/collectors/select.cc index e4eff8f4b..3cec750e8 100644 --- a/src/04kernel/src/collectors/select.cc +++ 
b/src/04kernel/src/collectors/select.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/select.h" #include "../kernels/select/cpu_kernel.hh" #include "../kernels/select/cuda_kernel.hh" +#include "../kernels/select/cnnl_kernel.hh" namespace refactor::kernel { @@ -35,6 +36,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(SelectCuda) break; + case decltype(_target)::Mlu: + REGISTER(SelectCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc index 54798e670..735692b90 100644 --- a/src/04kernel/src/kernels/cast/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -46,8 +46,10 @@ namespace refactor::kernel { struct Descriptors { cnnlTensorDescriptor_t inDesc, outDesc; cnnlCastDataType_t cast; + bool needCast; - Descriptors() : inDesc(nullptr), outDesc(nullptr) { + Descriptors(bool need) : inDesc(nullptr), outDesc(nullptr), + needCast(need) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); } @@ -56,15 +58,22 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); } }; - auto d = std::make_shared(); - d->cast = castType(from, to); + auto d = std::make_shared(from != to); + if (d->needCast) { + d->cast = castType(from, to); + } setCnnlTensor(d->inDesc, from, slice(shape.data(), shape.size())); setCnnlTensor(d->outDesc, to, slice(shape.data(), shape.size())); res.fetchOrStore(); return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, - d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + if (d->needCast) { + CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + } else { + CNNL_ASSERT(cnnlCopy(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->outDesc, 
outputs[0])); + } }; } diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc index 187ac4b51..0974a7600 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ -29,15 +29,10 @@ namespace refactor::kernel { return nullptr; } - std::optional biasExpand = std::nullopt; + int biasSize_ = 0; if (b) { ASSERT(b->get().shape[0] == y.shape[1], ""); - std::vector input(y.rank(), 1); - input[1] = y.shape[1]; - biasExpand.emplace(ExpandInfoCnnl( - b->get().dataType, - slice(input.data(), input.size()), - slice(y.shape.data(), y.rank()))); + biasSize_ = b->get().shape[0]; } // group is not supported @@ -70,7 +65,7 @@ namespace refactor::kernel { {d[0], d[1]}, {p[0], p[1], p[2], p[3]}, {s[0], s[1]}, - std::move(biasExpand), + biasSize_, }); } @@ -92,22 +87,18 @@ namespace refactor::kernel { // RAII for closure struct Descriptors { - cnnlTensorDescriptor_t x, y, w; + cnnlTensorDescriptor_t x, y, w, b; cnnlTensorDescriptor_t xTrans, yTrans, wTrans; cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; cnnlConvolutionDescriptor_t conv; cnnlConvolutionForwardAlgo_t algo; - // std::optional extraPadding; - std::optional biasExpand; - bool f32; - - Descriptors(decltype(f32) f32_) - :// extraPadding(std::nullopt), - biasExpand(std::nullopt), - f32(f32_) { + bool bias; + + Descriptors(decltype(bias) bias_) : bias(bias_) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&w)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&b)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&xTrans)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&yTrans)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&wTrans)); @@ -119,6 +110,7 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(w)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(b)); 
CNNL_ASSERT(cnnlDestroyTensorDescriptor(xTrans)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(yTrans)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(wTrans)); @@ -130,11 +122,8 @@ namespace refactor::kernel { Descriptors(const Descriptors &) = delete; Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(info.dt != DataType::F64); - // d->extraPadding = ExtraPadding::build(info.dt, info.xShape, info.pad); - if (info.biasExpand) { - d->biasExpand = ExpandCnnl(*info.biasExpand).lower(res).routine; - } + auto d = std::make_shared(info.biasSize > 0); + int xs[]{ info.xShape[0], info.xShape[1], @@ -154,10 +143,15 @@ namespace refactor::kernel { setCnnlTensor(d->x, info.dt, slice(xs, 4)); setCnnlTensor(d->y, info.dt, slice(info.yShape, 4)); setCnnlTensor(d->w, info.dt, slice(info.wShape, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->xTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, xsNHWC.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->yTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, ysNHWC.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->wTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, wsNHWC.data())); - + if (d->bias) { + int biasDim[] = {1, 1, 1, info.biasSize}; + CNNL_ASSERT(cnnlSetTensorDescriptor(d->b, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, biasDim)); + } + auto xTransSize = cnnlGetTensorElementNum(d->xTrans) * info.dt.size(); auto yTransSize = cnnlGetTensorElementNum(d->yTrans) * info.dt.size(); auto wTransSize = cnnlGetTensorElementNum(d->wTrans) * info.dt.size(); @@ -188,10 +182,6 @@ namespace refactor::kernel { handle, d->xTrans, d->wTrans, d->yTrans, NULL, d->conv, d->algo, &convWorkspaceSize)); - // if (d->extraPadding) { - // workspaceSize = hardware::alignBytes(workspaceSize, 256); - // } - size_t workspaceSize = xTransSize + yTransSize + wTransSize + std::max({xWorkspaceSize, wWorkspaceSize, yWorkspaceSize, convWorkspaceSize}); res.fetchOrStore(); @@ -201,12 +191,6 @@ namespace refactor::kernel { auto handle 
= res.fetchOrStore()->handle; void const *x = inputs[0], *w = inputs[1]; void *y = outputs[0]; - // if (auto f = d->extraPadding; f) { - // x = (*f)(x, reinterpret_cast(workspace) + workspaceSize); - // } - // if (auto f = d->biasExpand; f) { - // (*f)(res, workspace, inputs + 2, outputs); - // } void *xTrans = workspace; void *wTrans = reinterpret_cast(xTrans) + xTransSize; @@ -218,19 +202,16 @@ namespace refactor::kernel { d->xTrans, xTrans, opWorkspace, xWorkspaceSize)); CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->w, w, d->wTrans, wTrans, opWorkspace, wWorkspaceSize)); - - // build alpha/beta for double - auto a = d->f32 ? factor(1) : factor(1), - b = d->f32 - ? factor(d->biasExpand ? 1 : 0) - : factor(d->biasExpand ? 1 : 0); + + auto bDesc = (d->bias) ? d->b : NULL; + auto bData = (d->bias) ? inputs[2] : NULL; CNNL_ASSERT(cnnlConvolutionForward( handle, - d->conv, d->algo, &a, + d->conv, d->algo, NULL, d->xTrans, xTrans, d->wTrans, wTrans, - NULL, NULL, opWorkspace, convWorkspaceSize, - &b, d->yTrans, yTrans)); - + bDesc, bData, opWorkspace, convWorkspaceSize, + NULL, d->yTrans, yTrans)); + // transpose NHWC intermediates to NCHW CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->yTrans, yTrans, d->y, y, opWorkspace, yWorkspaceSize)); diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.hh b/src/04kernel/src/kernels/conv/cnnl_kernel.hh index 814e75ad5..c486cdd17 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.hh @@ -19,7 +19,7 @@ namespace refactor::kernel { dilation[2], pad[4], stride[2]; - std::optional biasExpand; + int biasSize; } info; explicit ConvCnnl(decltype(info)) noexcept; diff --git a/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc new file mode 100644 index 000000000..25ec6d0cb --- /dev/null +++ b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc @@ -0,0 +1,81 @@ +#include "cnnl_kernel.hh" +#include 
"kernel/collectors/hard_sigmoid.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = HardSigmoidCnnl; + using DT = DataType; + + K::HardSigmoidCnnl(float alpha_, float beta_, DT dataType_, int size_) noexcept + : Kernel(), alpha(alpha_), beta(beta_), dataType(dataType_), size(size_) {} + + auto K::build(float alpha_, float beta_, Tensor const &a) noexcept -> KernelBox { + +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(alpha_, beta_, a.dataType, a.elementsSize()); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing hardsigmoid using CNNL"; + } + +#ifdef USE_BANG + + auto HardSigmoidCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlActivationDescriptor_t activation; + cnnlTensorDescriptor_t tensor; + + Descriptors() : activation(nullptr), tensor(nullptr) { + CNNL_ASSERT(cnnlCreateActivationDescriptor(&activation)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyActivationDescriptor(activation)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, CNNL_ACTIVATION_HARDSIGMOID, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, 0.0, + 0.0, alpha, beta, true)); + + res.fetchOrStore(); + return [d = std::move(d)]// + (Resources & res, void *, void const *const *inputs, void *const 
*outputs) { + float alpha = 1.f, beta = 0.f; + CNNL_ASSERT(cnnlActivationForward( + res.fetchOrStore()->handle, + d->activation, + &alpha, d->tensor, inputs[0], + &beta, d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh new file mode 100644 index 000000000..c343d1fed --- /dev/null +++ b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH +#define KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct HardSigmoidCnnl final : public Kernel { + float alpha, beta; + DataType dataType; + int size; + + HardSigmoidCnnl(float, float, DataType, int) noexcept; + + static KernelBox build(float, float, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/pad/cnnl_kernel.cc b/src/04kernel/src/kernels/pad/cnnl_kernel.cc new file mode 100644 index 000000000..39648f851 --- /dev/null +++ b/src/04kernel/src/kernels/pad/cnnl_kernel.cc @@ -0,0 +1,95 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = PadCnnl; + + K::PadCnnl(DataType dataType_, PadType mode_, std::vector inDim_, + std::vector outDim_, std::vector padDim_, size_t len_) noexcept + : Kernel(), dataType(dataType_), mode(mode_), inDim(std::move(inDim_)), + outDim(std::move(outDim_)), padDim(std::move(padDim_)), valueLength(len_) {} + + auto K::build(PadDimension dims_, DataType dataType_, PadType 
mode_, std::optional> value_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + if (mode_ != PadType::Constant || (value_ && value_->get().dataType != dataType_)) { + return nullptr; + } + size_t valueLength_ = value_ ? value_->get().dataType.size() : 0; + std::vector inDim_, outDim_, padDim_; + for (auto dim : dims_) { + inDim_.push_back(dim.dimI); + outDim_.push_back(dim.dimO); + padDim_.push_back(dim.pads); + } + + return std::make_unique(dataType_, mode_, inDim_, outDim_, padDim_, valueLength_); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing Pad using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + + Descriptors() : inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + setCnnlTensor(d->inDesc, dataType, slice(inDim.data(), inDim.size())); + setCnnlTensor(d->outDesc, dataType, slice(outDim.data(), outDim.size())); + + std::vector pads; + for (auto d : padDim) { + pads.push_back(d); + pads.push_back(d); + } + + res.fetchOrStore(); + return [d = std::move(d), val = valueLength, + p = std::vector(pads.begin(), pads.end())](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + void *paddingValue; + if (val != 0) { + paddingValue = malloc(val); + BANG_ASSERT(cnrtMemcpy(paddingValue, const_cast(inputs[2]), + val, CNRT_MEM_TRANS_DIR_DEV2HOST)); + } else { + float zero = 
0.0; + paddingValue = &zero; + } + + CNNL_ASSERT(cnnlPad(res.fetchOrStore()->handle, + d->inDesc, inputs[0], p.data(), paddingValue, + d->outDesc, outputs[0])); + + if (val != 0) { + free(paddingValue); + } + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/pad/cnnl_kernel.hh b/src/04kernel/src/kernels/pad/cnnl_kernel.hh new file mode 100644 index 000000000..202518fd4 --- /dev/null +++ b/src/04kernel/src/kernels/pad/cnnl_kernel.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_PAD_CNNL_HH +#define KERNEL_PAD_CNNL_HH + +#include "kernel/attributes/pad_info.h" +#include "kernel/collectors/pad.h" + +namespace refactor::kernel { + + struct PadCnnl final : public Kernel { + DataType dataType; + PadType mode; + std::vector inDim, outDim, padDim; + size_t valueLength; + + PadCnnl(DataType, PadType, std::vector, std::vector, std::vector, size_t) noexcept; + static KernelBox build(PadDimension, DataType, PadType, std::optional>) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif//KERNEL_PAD_CNNL_HH diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc index 8f181b58b..2152acf5b 100644 --- a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc @@ -78,7 +78,6 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlScatterNd_v2(res.fetchOrStore()->handle, CNNL_SCATTERND_UPDATE, d->indexDesc, inputs[1], d->updateDesc, inputs[2], d->inDesc, inputs[0], d->outDesc, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } #endif diff --git a/src/04kernel/src/kernels/select/cnnl_kernel.cc b/src/04kernel/src/kernels/select/cnnl_kernel.cc new file mode 100644 index 000000000..be54a8904 --- /dev/null +++ 
b/src/04kernel/src/kernels/select/cnnl_kernel.cc @@ -0,0 +1,151 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = SelectCnnl; + + K::SelectCnnl(decltype(dataType) dataType_, + decltype(selectType) selectType_, + decltype(inputDims) inputDims_, + decltype(outputDims) outputDims_, + decltype(inputsNum) inputsNum_) noexcept + : dataType(dataType_), + selectType(selectType_), + inputDims(std::move(inputDims_)), + outputDims(std::move(outputDims_)), + inputsNum(inputsNum_) {} + + auto K::build(SelectType selectType_, TensorRefs inputs_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + auto dt = inputs_[0].get().dataType; + std::vector> inputDims_, outputDims_; + for (size_t i = 0; i < inputs_.size(); i++) { + auto shape = std::vector(inputs_[i].get().shape.begin(), + inputs_[i].get().shape.end()); + if (shape.size() == 0) { + shape.push_back(1); + } + inputDims_.push_back(shape); + } + + auto broadcastShape = [](const std::vector &shape1, const std::vector &shape2) -> std::vector { + int max_dim = std::max(shape1.size(), shape2.size()); + + std::vector resultShape(max_dim, 1); + int dim_diff1 = max_dim - shape1.size(); + int dim_diff2 = max_dim - shape2.size(); + + for (int i = 0; i < max_dim; ++i) { + int dim_size1 = (i >= dim_diff1) ? shape1[i - dim_diff1] : 1; + int dim_size2 = (i >= dim_diff2) ? 
shape2[i - dim_diff2] : 1; + resultShape[i] = std::max(dim_size1, dim_size2); + } + + return resultShape; + }; + + for (size_t i = 1; i < inputs_.size(); i++) { + outputDims_.push_back(broadcastShape(inputDims_[i - 1], inputDims_[i])); + } + + return std::make_unique(dt, selectType_, inputDims_, outputDims_, inputs_.size()); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing select operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + std::vector in, out; + + explicit Descriptors(int n) + : in(std::vector(n, nullptr)), + out(std::vector(n - 1, nullptr)) { + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in[i])); + if (i != n - 1) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + } + ~Descriptors() noexcept(false) { + for (size_t i = 0; i < in.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in[i])); + if (i != in.size() - 1) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(inputsNum); + for (size_t i = 0; i < inputsNum; i++) { + setCnnlTensor(d->in[i], dataType, slice(inputDims[i].data(), inputDims[i].size())); + if (i != inputsNum - 1) { + setCnnlTensor(d->out[i], dataType, slice(outputDims[i].data(), outputDims[i].size())); + } + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + switch (selectType) { + case SelectType::Max: + CNNL_ASSERT(cnnlGetMaximumWorkspaceSize(handle, d->out.back(), &workspaceSize)); + break; + case SelectType::Min: + CNNL_ASSERT(cnnlGetMinimumWorkspaceSize(handle, d->out.back(), &workspaceSize)); 
+ break; + default: + UNREACHABLE(); + } + + res.fetchOrStore(); + auto routine = [d = std::move(d), type = selectType, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + auto select = + (type == SelectType::Max) ? cnnlMaximum + : (type == SelectType::Min) ? cnnlMinimum + : nullptr; + + for (size_t i = 1; i < d->in.size(); i++) { + if (i == 1) { + CNNL_ASSERT(select( + handle, d->in[0], inputs[0], d->in[1], inputs[1], + d->out[0], outputs[0], workspace, workspaceSize)); + } else { + CNNL_ASSERT(select( + handle, d->out[i - 2], outputs[0], d->in[i], inputs[i], + d->out[i - 1], outputs[0], workspace, workspaceSize)); + } + } + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/select/cnnl_kernel.hh b/src/04kernel/src/kernels/select/cnnl_kernel.hh new file mode 100644 index 000000000..680911f36 --- /dev/null +++ b/src/04kernel/src/kernels/select/cnnl_kernel.hh @@ -0,0 +1,33 @@ +#ifndef KERNEL_SELECT_CNNL_KERNEL_HH +#define KERNEL_SELECT_CNNL_KERNEL_HH + +#include "kernel/attributes/broadcaster.h" +#include "kernel/collectors/select.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct SelectCnnl final : public Kernel { + DataType dataType; + SelectType selectType; + std::vector> inputDims; + std::vector> outputDims; + size_t inputsNum; + + SelectCnnl(decltype(dataType), decltype(selectType), decltype(inputDims), + decltype(outputDims), decltype(inputsNum)) noexcept; + + static KernelBox build(SelectType, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// 
KERNEL_SELECT_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc index a5a6881e4..58f2d4fd4 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -72,8 +72,12 @@ namespace refactor::kernel { }; auto d = std::make_shared(info.dataType != DT::F64); - setCnnlTensor(d->x, info.dataType, slice(info.inDim.data(), info.inDim.size())); - setCnnlTensor(d->y, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outDim.size(), info.outDim.data())); CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, info.perm.size(), info.perm.data())); auto handle = res.fetchOrStore()->handle; diff --git a/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp b/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp new file mode 100644 index 000000000..ad26438bf --- /dev/null +++ b/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp @@ -0,0 +1,51 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/hard_sigmoid/cnnl_kernel.hh" +#include "../../../src/kernels/hard_sigmoid/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, HardSigmoidCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 5}); + float alpha = 0.2f, beta = 0.5f; + auto kernel = HardSigmoidCnnl::build(alpha, beta, *dataTensor); + auto kCpu = HardSigmoidCpu::build(alpha, beta, *dataTensor); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // 
malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{data.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluMem->copyToHost(result.data(), dataTensor->bytesSize()); + // check + for (auto i : range0_(data.size())) { + EXPECT_FLOAT_EQ(data[i], result[i]); + } +} + +#endif diff --git a/src/04kernel/test/kernels/pad/test_cnnl.cpp b/src/04kernel/test/kernels/pad/test_cnnl.cpp new file mode 100644 index 000000000..9243be4c7 --- /dev/null +++ b/src/04kernel/test/kernels/pad/test_cnnl.cpp @@ -0,0 +1,131 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/pad/cpu_kernel.hh" +#include "../../../src/kernels/pad/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, PadCnnl) { + { + PadDimension dims{ + {2, 4, 1}, + {3, 5, 1}, + {1, 1, 0}, + {4, 8, 2}, + }; + // build routine + auto t1Tensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + auto t2Tensor = Tensor::share(DataType::I64, Shape{8}); + auto t3Tensor = Tensor::share(DataType::F32, Shape{}); + auto yTensor = Tensor::share(DataType::F32, Shape{4, 5, 1, 8}); + PadType type = PadType::Constant; + auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); + auto kernel = PadCnnl::build(dims, DataType::F32, type, std::make_optional(std::reference_wrapper(*t3Tensor))); + 
ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(t1Tensor->bytesSize()), + mluIn2 = dev.malloc(t2Tensor->bytesSize()), + mluIn3 = dev.malloc(t3Tensor->bytesSize()), + mluOut = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector data(t1Tensor->elementsSize()), + constvalue(1, 1.2f), + cpuOut(yTensor->elementsSize()); + std::vector pads{1, 1, 0, 2, 1, 1, 0, 2}; + + + for (auto i : range0_(data.size())) { data[i] = i; } + mluIn->copyFromHost(data.data(), t1Tensor->bytesSize()); + mluIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); + mluIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); + + // inference + { + void const *inputs[]{*mluIn, *mluIn2, *mluIn3}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data(), pads.data(), constvalue.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(yTensor->elementsSize()); + mluOut->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(cpuOut.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } + } + + { + PadDimension dims{ + {2, 2, 0}, + {3, 3, 0}, + {1, 1, 0}, + {4, 4, 0}, + }; + // build routine + auto t1Tensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + auto t2Tensor = Tensor::share(DataType::I64, Shape{8}); + auto t3Tensor = Tensor::share(DataType::F32, Shape{}); + auto yTensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + PadType type = PadType::Constant; + auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); + auto kernel = PadCnnl::build(dims, DataType::F32, type, std::make_optional(std::reference_wrapper(*t3Tensor))); + ASSERT_TRUE(kernel && 
kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(t1Tensor->bytesSize()), + mluIn2 = dev.malloc(t2Tensor->bytesSize()), + mluIn3 = dev.malloc(t3Tensor->bytesSize()), + mluOut = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector data(t1Tensor->elementsSize()), + constvalue(1, 1.2f), + cpuOut(yTensor->elementsSize()); + std::vector pads{0, 0, 0, 0, 0, 0, 0, 0}; + + + for (auto i : range0_(data.size())) { data[i] = i; } + mluIn->copyFromHost(data.data(), t1Tensor->bytesSize()); + mluIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); + mluIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); + + // inference + { + void const *inputs[]{*mluIn, *mluIn2, *mluIn3}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data(), pads.data(), constvalue.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(yTensor->elementsSize()); + mluOut->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(cpuOut.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } + } +} + +#endif diff --git a/src/04kernel/test/kernels/select/test_cnnl.cpp b/src/04kernel/test/kernels/select/test_cnnl.cpp new file mode 100644 index 000000000..b691fc3d1 --- /dev/null +++ b/src/04kernel/test/kernels/select/test_cnnl.cpp @@ -0,0 +1,99 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/select/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testSelect(const SelectType selectType, const std::vector &shapes, const Shape &outShape, const std::vector> &data, + 
const std::vector expectData) { + // build routine + TensorRefs dataTensors; + std::vector tensorsVec; + for (size_t i = 0; i < shapes.size(); ++i) { + tensorsVec.push_back(Tensor(DataType::F32, shapes[i], LayoutType::Others, nullptr)); + } + for (size_t i = 0; i < shapes.size(); ++i) { + dataTensors.push_back(std::cref(tensorsVec[i])); + } + auto result = Tensor::share(DataType::F32, outShape); + auto kernel = SelectCnnl::build(selectType, dataTensors); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // cnnl malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIns[]{ + dev.malloc(dataTensors[0].get().bytesSize()), + dev.malloc(dataTensors[1].get().bytesSize()), + dev.malloc(dataTensors[2].get().bytesSize()), + }, + mluOut = dev.malloc(result->bytesSize()); + // put input data + mluIns[0]->copyFromHost(data[0].data(), dataTensors[0].get().bytesSize()); + mluIns[1]->copyFromHost(data[1].data(), dataTensors[1].get().bytesSize()); + mluIns[2]->copyFromHost(data[2].data(), dataTensors[2].get().bytesSize()); + // inference + { + void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2]}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); + } + // check + std::vector out(result->elementsSize()); + mluOut->copyToHost(out.data(), result->bytesSize()); + for (auto i : range0_(expectData.size())) { + EXPECT_FLOAT_EQ(expectData[i], out[i]); + } +} + +TEST(kernel, SelectCnnl) { + // no need broadcast + testSelect(SelectType::Max, + {{1, 3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + {{1, 3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, + {1, 2, 1}); + + // need broadcast + testSelect(SelectType::Max, + {{3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + 
{{3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, + {1, 3, 3}); + + testSelect(SelectType::Max, + {{1}, {1, 3}, {1, 3}}, + {1, 3}, + {{3}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + {{1}, {1, 3}, {1, 3}}, + {1, 3}, + {{3}, {1, 4, 4}, {2, 5, 3}}, + {1, 3, 3}); +} + +#endif