From c291c79c53cd1420f8acae993620bdac549cb033 Mon Sep 17 00:00:00 2001 From: Bolun Date: Fri, 5 Jan 2024 06:57:40 +0000 Subject: [PATCH 1/9] =?UTF-8?q?feat:=20=E6=8E=A5=E5=85=A5CNNL=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E6=B7=BB=E5=8A=A0unary/binary/softmax/batchnorm/reduc?= =?UTF-8?q?e/transpose/pooling=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/02hardware/src/device_manager.cpp | 1 + .../src/collectors/batch_normalization.cc | 4 + src/04kernel/src/collectors/pool.cc | 6 + src/04kernel/src/collectors/reduce.cc | 4 + src/04kernel/src/collectors/simple_binary.cc | 4 + src/04kernel/src/collectors/simple_unary.cc | 6 + src/04kernel/src/collectors/softmax.cc | 7 + src/04kernel/src/collectors/transpose.cc | 6 + .../batch_normalization/cnnl_kernel.cc | 158 ++++++++++++++ .../batch_normalization/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/pool/cnnl_kernel.cc | 156 ++++++++++++++ src/04kernel/src/kernels/pool/cnnl_kernel.hh | 45 ++++ .../src/kernels/reduce/cnnl_kernel.cc | 128 ++++++++++++ .../src/kernels/reduce/cnnl_kernel.hh | 32 +++ .../src/kernels/simple_binary/binary_cnnl.cc | 195 ++++++++++++++++++ .../src/kernels/simple_binary/binary_cnnl.hh | 28 +++ .../simple_unary/cnnl_activation_kernel.cc | 91 ++++++++ .../simple_unary/cnnl_activation_kernel.hh | 27 +++ .../simple_unary/cnnl_simple_unary_kernel.cc | 94 +++++++++ .../simple_unary/cnnl_simple_unary_kernel.hh | 27 +++ .../src/kernels/softmax/cnnl_kernel.cc | 86 ++++++++ .../src/kernels/softmax/cnnl_kernel.hh | 36 ++++ .../src/kernels/transpose/cnnl_kernel.cc | 92 +++++++++ .../src/kernels/transpose/cnnl_kernel.hh | 32 +++ .../src/utilities/bang/cnnl_context.cc | 35 ++++ .../src/utilities/bang/cnnl_context.hh | 29 +++ .../src/utilities/bang/cnnl_functions.cpp | 38 ++++ .../src/utilities/bang/cnnl_functions.h | 40 ++++ .../kernels/batch_normalization/test_cnnl.cpp | 70 +++++++ src/04kernel/test/kernels/pool/test_cnnl.cpp | 70 +++++++ 
.../test/kernels/reduce/test_cnnl.cpp | 64 ++++++ .../simple_binary/test_binary_cnnl.cpp | 90 ++++++++ .../test/kernels/simple_unary/test_cnnl.cpp | 63 ++++++ .../test/kernels/softmax/test_cnnl.cpp | 52 +++++ .../test/kernels/transpose/test_cnnl.cpp | 55 +++++ 35 files changed, 1903 insertions(+) create mode 100644 src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/pool/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/pool/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/reduce/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/reduce/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/simple_binary/binary_cnnl.cc create mode 100644 src/04kernel/src/kernels/simple_binary/binary_cnnl.hh create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc create mode 100644 src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/transpose/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/transpose/cnnl_kernel.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.cc create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.cpp create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.h create mode 100644 src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/pool/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/reduce/test_cnnl.cpp create mode 100644 
src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp create mode 100644 src/04kernel/test/kernels/simple_unary/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/softmax/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/transpose/test_cnnl.cpp diff --git a/src/02hardware/src/device_manager.cpp b/src/02hardware/src/device_manager.cpp index 15ae3b901..8a7f8edd2 100644 --- a/src/02hardware/src/device_manager.cpp +++ b/src/02hardware/src/device_manager.cpp @@ -2,6 +2,7 @@ #include "hardware/devices/cpu.h" #include "hardware/devices/mlu.h" #include "hardware/devices/nvidia.h" +#include "hardware/devices/mlu.h" /* NOTE(review): duplicate — "hardware/devices/mlu.h" is already in the context two lines above; this hunk should be dropped from the patch */ namespace refactor::hardware::device { diff --git a/src/04kernel/src/collectors/batch_normalization.cc b/src/04kernel/src/collectors/batch_normalization.cc index 93bcb240e..e944e37d7 100644 --- a/src/04kernel/src/collectors/batch_normalization.cc +++ b/src/04kernel/src/collectors/batch_normalization.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/batch_normalization.h" #include "../kernels/batch_normalization/cpu_kernel.hh" #include "../kernels/batch_normalization/cudnn_kernel.hh" +#include "../kernels/batch_normalization/cnnl_kernel.hh" namespace refactor::kernel { @@ -20,6 +21,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(BatchNormalizationCudnn) break; + case decltype(_target)::Mlu: + REGISTER(BatchNormalizationCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/pool.cc b/src/04kernel/src/collectors/pool.cc index 458d3a375..d034e199f 100644 --- a/src/04kernel/src/collectors/pool.cc +++ b/src/04kernel/src/collectors/pool.cc @@ -1,5 +1,6 @@ #include "kernel/collectors/pool.h" #include "../kernels/pool/cudnn_kernel.hh" +#include "../kernels/pool/cnnl_kernel.hh" namespace refactor::kernel { @@ -29,6 +30,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PoolCnnl::build(type, ceil,
kernelShape, attributes, x, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/reduce.cc b/src/04kernel/src/collectors/reduce.cc index bec37731d..71fa194ba 100644 --- a/src/04kernel/src/collectors/reduce.cc +++ b/src/04kernel/src/collectors/reduce.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/reduce.h" #include "../kernels/reduce/cpu_kernel.hh" #include "../kernels/reduce/cudnn_kernel.hh" +#include "../kernels/reduce/cnnl_kernel.hh" namespace refactor::kernel { @@ -27,6 +28,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(ReduceCudnn) break; + case decltype(_target)::Mlu: + REGISTER(ReduceCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/simple_binary.cc b/src/04kernel/src/collectors/simple_binary.cc index 53ae6723c..e61d013f9 100644 --- a/src/04kernel/src/collectors/simple_binary.cc +++ b/src/04kernel/src/collectors/simple_binary.cc @@ -2,6 +2,7 @@ #include "../kernels/simple_binary/binary_cudnn.hh" #include "../kernels/simple_binary/cpu_kernel.hh" #include "../kernels/simple_binary/cuda_kernel.hh" +#include "../kernels/simple_binary/binary_cnnl.hh" namespace refactor::kernel { @@ -50,6 +51,9 @@ namespace refactor::kernel { REGISTER_BROCAST(BinaryCudnn) REGISTER(BinaryCuda) break; + case decltype(_target)::Mlu: + REGISTER_BROCAST(BinaryCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/simple_unary.cc b/src/04kernel/src/collectors/simple_unary.cc index 51a334c91..c489acecf 100644 --- a/src/04kernel/src/collectors/simple_unary.cc +++ b/src/04kernel/src/collectors/simple_unary.cc @@ -2,6 +2,8 @@ #include "../kernels/simple_unary/cpu_kernel.hh" #include "../kernels/simple_unary/cuda_kernel.hh" #include "../kernels/simple_unary/cudnn_activation_kernel.hh" +#include "../kernels/simple_unary/cnnl_activation_kernel.hh" +#include 
"../kernels/simple_unary/cnnl_simple_unary_kernel.hh" #include "common.h" namespace refactor::kernel { @@ -55,6 +57,10 @@ namespace refactor::kernel { REGISTER(ActivationCudnn) REGISTER(SimpleUnaryCuda) break; + case decltype(_target)::Mlu: + REGISTER(ActivationCnnl) + REGISTER(SimpleUnaryCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/softmax.cc b/src/04kernel/src/collectors/softmax.cc index 2ce442696..020bc6ded 100644 --- a/src/04kernel/src/collectors/softmax.cc +++ b/src/04kernel/src/collectors/softmax.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/softmax.h" +#include "../kernels/softmax/cnnl_kernel.hh" #include "../kernels/softmax/cpu_kernel.hh" #include "../kernels/softmax/cuda_kernel.hh" #include "../kernels/softmax/cudnn_kernel.hh" @@ -28,6 +29,12 @@ namespace refactor::kernel { } break; } + case decltype(_target)::Mlu: { + if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + } default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/transpose.cc b/src/04kernel/src/collectors/transpose.cc index c8d651974..c91493f98 100644 --- a/src/04kernel/src/collectors/transpose.cc +++ b/src/04kernel/src/collectors/transpose.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/transpose.h" #include "../kernels/transpose/cpu_kernel.hh" #include "../kernels/transpose/cuda_kernel.hh" +#include "../kernels/transpose/cnnl_kernel.hh" namespace refactor::kernel { @@ -25,6 +26,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = TransposeCnnl::build(data.dataType, data.shape, perm); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc new file mode 100644 index 
000000000..be06233cd --- /dev/null +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -0,0 +1,158 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = BatchNormalizationCnnl; + using DT = DataType; + + K::BatchNormalizationCnnl(decltype(info) info_) noexcept + : info(info_) {} + + auto K::build(float epsilon, TensorRefs inputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + auto const &x = inputs[0].get(); + auto const &scale = inputs[1].get(); + auto const &mean = inputs[3].get(); + + if (x.rank() != 4) { + return nullptr; + } + + // see "Supported Configurations for `cnnlBatchNormalizationForwardInference`" + if (scale.dataType != mean.dataType) { + return nullptr; + } + if (x.dataType == DT::F64) { + if (scale.dataType != DT::F64) { + return nullptr; + } + } else { + if (scale.dataType != DT::F32) { + return nullptr; + } + } + return std::make_unique(decltype(info){ + epsilon, + x.dataType, + scale.dataType, + x.layout, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }}); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing batch normalization for non-training-mode using CNNL"; + } + +#ifdef USE_BANG + + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t inDesc, inDescTrans, p; + cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : inDesc(nullptr), inDescTrans(nullptr), p(nullptr), + 
NCHW2NHWC(nullptr), NHWC2NCHW(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDescTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&p)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDescTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(p)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dtX != DT::F64); + int dimNCHW[4] = {info.dimAx[0], info.dimAx[1], info.dimAx[2], info.dimAx[3]}; + int dimNHWC[4] = {info.dimAx[0], info.dimAx[2], info.dimAx[3], info.dimAx[1]}; + int dimParam[]{info.dimAx[1]}; + setCnnlTensor(d->inDesc, info.dtX, slice(dimNCHW, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->inDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dtX), 4, dimNHWC)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->p, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dtP), 1, dimParam)); + int permute[4] = {0, 2, 3, 1}; + int permuteOut[4] = {0, 3, 1, 2}; + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permute)); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); + + auto handle = res.fetchOrStore()->handle; + auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX); /* NOTE(review): sizeof(info.dtX) is sizeof(DataType), not the element byte width — confirm against the project's DataType API and use the element size */ + size_t workspaceSize; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize)); + size_t totalWorkspaceSize = xTransSize * 2 + workspaceSize; /* xTrans + yTrans staging buffers + transpose scratch; the routine below carves all three out of this one allocation */ + + res.fetchOrStore(); + auto routine = [d = std::move(d), + epsilon = info.epsilon, + xTransSize, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from
resources + auto handle = res.fetchOrStore()->handle; + + // name inputs and outputs + auto x = inputs[0], + scale = inputs[1], + bias = inputs[2], + mean = inputs[3], + var = inputs[4]; + auto y = outputs[0]; + + void *xTrans = workspace; + void *yTrans = reinterpret_cast<uint8_t *>(xTrans) + xTransSize; /* byte-typed arithmetic — pointer math on void* is a GNU extension, not standard C++ */ + void *cursor = reinterpret_cast<uint8_t *>(yTrans) + xTransSize; /* transpose scratch sits AFTER both staging buffers; the previous "+ workspaceSize" offset overlapped yTrans and ran past the allocation */ + + // transpose NCHW input to NHWC + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, + d->inDescTrans, xTrans, cursor, workspaceSize)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? factor(0) : factor(0); + CNNL_ASSERT(cnnlBatchNormForwardInference( + handle, &a, &b, + d->inDescTrans, xTrans, d->p, scale, bias, mean, var, + epsilon, d->inDescTrans, yTrans)); + + // transpose NHWC intermediates to NCHW + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans, + d->inDesc, y, cursor, workspaceSize)); + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), totalWorkspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh new file mode 100644 index 000000000..978b0dedc --- /dev/null +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH +#define KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + /// @brief Use `cnnlBatchNormalizationForwardInference`. + /// It only supports 4D and 5D tensors.
+ struct BatchNormalizationCnnl final : public Kernel { + struct { + float epsilon; + DataType dtX, dtP; + LayoutType layout; + int dimAx[4];// dimA for x + } info; + + explicit BatchNormalizationCnnl(decltype(info)) noexcept; + + static KernelBox build(float, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.cc b/src/04kernel/src/kernels/pool/cnnl_kernel.cc new file mode 100644 index 000000000..083125b1f --- /dev/null +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.cc @@ -0,0 +1,156 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = PoolCnnl; + + K::PoolCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(PoolType poolType, + bool ceil, + KernelShape const &kernelShape, + PoolAttributes const &poolAttributes, + Tensor const &x, + Tensor const &y) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + // TODO check data type + auto p = poolAttributes.pads(), + d = poolAttributes.dilations(), + s = poolAttributes.strides(); + if (x.rank() != 4 || + poolType == PoolType::Lp || + d[0] != 1 || d[1] != 1) { + return nullptr; + } + return std::make_unique(decltype(info){ + poolType, + x.dataType, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }, + { + static_cast(y.shape[0]), + static_cast(y.shape[1]), + static_cast(y.shape[2]), + static_cast(y.shape[3]), + }, + { + static_cast(kernelShape[0]), + static_cast(kernelShape[1]), + }, + {p[0], p[1], p[2], p[3]}, + {s[0], s[1]}, + 
{d[0], d[1]}, + ceil + }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing pool using CNNL"; + } + +#ifdef USE_BANG + + auto PoolCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = PoolType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlPoolingDescriptor_t pooling; + bool f32; + + Descriptors(decltype(f32) f32_) : f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreatePoolingDescriptor(&pooling)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyPoolingDescriptor(pooling)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dt != DataType::F64); + int const + xs[]{ + info.xShape[0], + info.xShape[1], + info.xShape[2] + std::abs(info.pads[0] - info.pads[2]), + info.xShape[3] + std::abs(info.pads[1] - info.pads[3]), + }, + *ys = info.yShape; + setCnnlTensor(d->x, info.dt, slice(xs, 4)); + setCnnlTensor(d->y, info.dt, slice(ys, 4)); + + // clang-format off + auto mode = info.poolType == Ty::Average ? CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : info.poolType == Ty::Max ? 
CNNL_POOLING_MAX + : UNREACHABLEX(cnnlPoolingMode_t, ""); + // clang-format on + auto pp = info.pads; + auto ss = info.strides; + auto kk = info.kernelShape; + auto dd = info.dilations; + CNNL_ASSERT(cnnlSetPooling2dDescriptor_v2( + d->pooling, mode, CNNL_NOT_PROPAGATE_NAN, + kk[0], kk[1], pp[0], pp[2], pp[1], pp[3], + ss[0], ss[1], dd[0], dd[1], ceil)); /* NOTE(review): unqualified 'ceil' — presumably info.ceil; as written it can resolve to std::ceil from <cmath> — verify */ + + auto handle = res.fetchOrStore()->handle; + size_t extraInputSize, workspaceSize; + CNNL_ASSERT(cnnlGetPoolingWorkspaceSize(handle, mode, ys[3], ys[2], &workspaceSize)); + CNNL_ASSERT(cnnlGetPoolingExtraInputSize(handle, mode, ys[3], ys[2], &extraInputSize)); + + res.fetchOrStore(); + auto routine = [d, workspaceSize, + extraInputSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + + void *extraInputDev = workspace; + void *poolWorkSpace = reinterpret_cast<uint8_t *>(workspace) + extraInputSize; /* byte-typed arithmetic — pointer math on void* is a GNU extension, not standard C++ */ + + void *extraInputHost = malloc(extraInputSize); /* NOTE(review): leaks if a CNNL_ASSERT below fails before free() — prefer an RAII buffer (e.g. std::vector<uint8_t>) */ + CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost)); + BANG_ASSERT(cnrtMemcpy(extraInputDev, extraInputHost, extraInputSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ?
factor(0) : factor(0); + CNNL_ASSERT(cnnlPoolingForward_v2( + handle, d->pooling, + &a, d->x, inputs[0], + &b, extraInputDev, d->y, outputs[0], + poolWorkSpace, workspaceSize)); + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + + free(extraInputHost); + }; + return {std::move(routine), workspaceSize + extraInputSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.hh b/src/04kernel/src/kernels/pool/cnnl_kernel.hh new file mode 100644 index 000000000..0a0298ede --- /dev/null +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.hh @@ -0,0 +1,45 @@ +#ifndef KERNEL_POOL_CNNL_KERNEL_HH +#define KERNEL_POOL_CNNL_KERNEL_HH + +#include "kernel/attributes/pool_attributes.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + /// @brief Use `cnnlPoolingForward`. + /// It only supports 4D tensors. + struct PoolCnnl final : public Kernel { + struct + { + PoolType poolType; + DataType dt; + int xShape[4], + yShape[4], + kernelShape[2], + pads[4], + strides[2], + dilations[2]; + bool ceil; + } info; + + explicit PoolCnnl(decltype(info)) noexcept; + + static KernelBox build(PoolType, + bool, + KernelShape const &, + PoolAttributes const &, + Tensor const &, + Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_POOL_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc new file mode 100644 index 000000000..752bee690 --- /dev/null +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -0,0 +1,128 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include 
"hardware/functions.h" +#endif + +namespace refactor::kernel { + using K = ReduceCnnl; + + K::ReduceCnnl( + decltype(dataType) dataType_, + decltype(reduceType) reduceType_, + decltype(axes) axes_, + decltype(shape) shape_) noexcept + : Kernel(), + dataType(dataType_), + reduceType(reduceType_), + axes(std::move(axes_)), + shape(std::move(shape_)) {} + + auto K::build(decltype(axes) axes_, ReduceType reduceType_, TensorRefs inputs_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + auto const &x = inputs_[0].get(); + return x.dataType.isFloat() + ? std::make_unique(x.dataType, reduceType_, std::move(axes_), x.shape) + : nullptr; + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing reduce operation using CNNL"; + } + +#ifdef USE_BANG + + auto ReduceCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlReduceDescriptor_t reduce; + bool f32; + + explicit Descriptors(decltype(f32) f32_) : f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateReduceDescriptor(&reduce)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyReduceDescriptor(reduce)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(dataType != DataType::F64); + + std::vector + dimsI(shape.begin(), shape.end()), + dimsO(shape.begin(), shape.end()); + for (auto axis : axes) { + dimsO[axis] = 1; + } + setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); + setCnnlTensor(d->y, 
dataType, slice(dimsO.data(), dimsO.size())); + + // clang-format off + auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG + : reduceType == ReduceType::Sum ? CNNL_REDUCE_ADD + : reduceType == ReduceType::Min ? CNNL_REDUCE_MIN + : reduceType == ReduceType::Max ? CNNL_REDUCE_MAX + : reduceType == ReduceType::L1 ? CNNL_REDUCE_NORM1 + : reduceType == ReduceType::L2 ? CNNL_REDUCE_NORM2 + : reduceType == ReduceType::Prod ? CNNL_REDUCE_MUL + : UNREACHABLEX(cnnlReduceOp_t, ""); + // clang-format on + CNNL_ASSERT(cnnlSetReduceDescriptor_v2( + d->reduce, (int *) (axes.data()), axes.size(), reduceOp, + cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0)); + + auto handler = res.fetchOrStore()->handle; + size_t idxWorkspaceSize = axes.size() * sizeof(int); + // idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256); + size_t workspaceSize; + // get workspace + CNNL_ASSERT(cnnlGetReduceOpWorkspaceSize(handler, d->x, d->y, d->reduce, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), + idxWorkspaceSize, + workspaceSize](Resources &res, + void *workspace, + void const *const *inputs, + void *const *outputs) { + void *idxWorkspace = workspace, + *dataWorkspace = reinterpret_cast(workspace) + idxWorkspaceSize; + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? 
factor(0) : factor(0); + CNNL_ASSERT(cnnlReduce( + res.fetchOrStore()->handle, + d->reduce, + dataWorkspace, workspaceSize, + &a, d->x, inputs[0], + idxWorkspaceSize, idxWorkspace, + &b, d->y, outputs[0])); + }; + return RoutineWorkspace(std::move(routine), idxWorkspaceSize + workspaceSize); + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.hh b/src/04kernel/src/kernels/reduce/cnnl_kernel.hh new file mode 100644 index 000000000..6ffaf7387 --- /dev/null +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH +#define KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH + +#include "kernel/collectors/reduce.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ReduceCnnl final : public Kernel { + DataType dataType; + ReduceType reduceType; + Axes axes; + Shape shape; + + ReduceCnnl(decltype(dataType), + decltype(reduceType), + decltype(axes), + decltype(shape)) noexcept; + + static KernelBox build(decltype(axes), ReduceType, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; +}// namespace refactor::kernel + +#endif// KERNEL_REDUCE_MEAN_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc new file mode 100644 index 000000000..cfe2ce0fa --- /dev/null +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -0,0 +1,195 @@ +#include "binary_cnnl.hh" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = BinaryCnnl; + using Op = SimpleBinaryType; + using DT = DataType; + + K::BinaryCnnl(Op opType_, DT dataType_, std::vector aDims_, 
std::vector bDims_, std::vector cDims_) noexcept + : Kernel(), dataType(dataType_), opType(opType_), aDims(aDims_), bDims(bDims_), cDims(cDims_) {} + + auto K::build(Op op, Tensor const &a, Tensor const &b, Tensor const &c) noexcept -> KernelBox { + static const std::unordered_set + ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow}; + +#ifndef USE_BANG + return nullptr; +#endif + + if (a.dataType != b.dataType || + !a.dataType.isFloat() || + !ARTHIMETIC.contains(op) || + // At least one of a,b should have the same shape as c + (a.shape != c.shape && b.shape != c.shape) || + // Sub only supports brocasting b + (a.shape != c.shape && op == Op::Sub) || + // Cnnl binary op only supports up to 5D + !((a.rank() == 5 && b.rank() == 5) || (a.rank() <= 4 && b.rank() <= 4))) { + return nullptr; + } + + auto shape2IntVec = [](Shape shape) -> std::vector { + std::vector intVector; + intVector.reserve(shape.size()); + for (const uint32_t &element : shape) { + intVector.push_back(static_cast(element)); + } + return intVector; + }; + + return std::make_unique(op, a.dataType, shape2IntVec(a.shape), shape2IntVec(b.shape), shape2IntVec(c.shape)); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing element-wise op of 2 tensors with CNNL"; + } + +#ifdef USE_BANG + + auto BinaryCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlOpTensorDescriptor_t opDesc; + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + bool f32, sub; + + Descriptors(decltype(f32) f32_) : f32(f32_), sub(false) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&aDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&bDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&cDesc)); + 
CNNL_ASSERT(cnnlCreateOpTensorDescriptor(&opDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(aDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(bDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(cDesc)); + CNNL_ASSERT(cnnlDestroyOpTensorDescriptor(opDesc)); + } + }; + auto d = std::make_shared(dataType != DT::F64); + cnnlOpTensorDesc_t cnnlOP; + cnnlLogicOp_t cnnlLogicOP; + if (opType == SimpleBinaryType::Add) { + cnnlOP = CNNL_OP_TENSOR_ADD; + } else if (opType == SimpleBinaryType::Sub) { + cnnlOP = CNNL_OP_TENSOR_ADD; + d->sub = true; + } else if (opType == SimpleBinaryType::Mul) { + cnnlOP = CNNL_OP_TENSOR_MUL; + } else if (opType == SimpleBinaryType::And) { + cnnlLogicOP = CNNL_LOGIC_OP_AND; + } else if (opType == SimpleBinaryType::Or) { + cnnlLogicOP = CNNL_LOGIC_OP_OR; + } else if (opType == SimpleBinaryType::Xor) { + cnnlLogicOP = CNNL_LOGIC_OP_XOR; + } + + setCnnlTensor(d->aDesc, dataType, slice(aDims.data(), aDims.size())); + setCnnlTensor(d->bDesc, dataType, slice(bDims.data(), bDims.size())); + setCnnlTensor(d->cDesc, dataType, slice(cDims.data(), cDims.size())); + CNNL_ASSERT(cnnlSetOpTensorDescriptor( + d->opDesc, cnnlOP, + cnnlDataTypeConvert(d->f32 ? DT::F32 : DT::F64), + CNNL_NOT_PROPAGATE_NAN)); + + auto cnnlGetBinaryWorkspaceSize = + (opType == SimpleBinaryType::Add || opType == SimpleBinaryType::Sub || opType == SimpleBinaryType::Mul) ? cnnlGetOpTensorWorkspaceSize + : (opType == SimpleBinaryType::Div) ? cnnlGetDivWorkspaceSize + : (opType == SimpleBinaryType::And || opType == SimpleBinaryType::Or || opType == SimpleBinaryType::Xor) ? cnnlGetLogicOpWorkspaceSize + : (opType == SimpleBinaryType::Pow) ? 
cnnlGetPowWorkspaceSize + : nullptr; + + if (cnnlGetBinaryWorkspaceSize == nullptr) { + UNREACHABLE(); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + if (aDims != cDims) { + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->bDesc, + d->aDesc, d->cDesc, + &workspaceSize)); + } else { + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, + d->bDesc, d->cDesc, + &workspaceSize)); + } + + res.fetchOrStore(); + auto routine = [swap = aDims != cDims, d, + workspaceSize, cnnlLogicOP, + op = this->opType](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + // name inputs and outputs + auto a = inputs[0], + b = inputs[1]; + auto c = outputs[0]; + if (op == SimpleBinaryType::Add || op == SimpleBinaryType::Sub || op == SimpleBinaryType::Mul) { + auto alphaA = d->f32 + ? factor(1) + : factor(1), + alphaB = d->f32 + ? factor(d->sub ? -1 : 1) + : factor(d->sub ? -1 : 1), + beta = d->f32 + ? 
factor(0) + : factor(0); + + if (swap) { + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaB, d->bDesc, b, + &alphaA, d->aDesc, a, + workspace, workspaceSize, + &beta, d->cDesc, c)); + } else { + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaA, d->aDesc, a, + &alphaB, d->bDesc, b, + workspace, workspaceSize, + &beta, d->cDesc, c)); + } + } else if (op == SimpleBinaryType::Div) { + CNNL_ASSERT(cnnlDiv_v2(handle, + CNNL_COMPUTATION_HIGH_PRECISION, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } else if (op == SimpleBinaryType::And || op == SimpleBinaryType::Or || op == SimpleBinaryType::Xor) { + CNNL_ASSERT(cnnlLogicOp(handle, cnnlLogicOP, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } else if (op == SimpleBinaryType::Pow) { + CNNL_ASSERT(cnnlPow(handle, + CNNL_COMPUTATION_HIGH_PRECISION, + d->aDesc, a, + d->bDesc, b, + workspace, workspaceSize, + d->cDesc, c)); + } + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh b/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh new file mode 100644 index 000000000..2d5c7cfaa --- /dev/null +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_BINARY_CNNL_HH +#define KERNEL_BINARY_CNNL_HH + +#include "kernel/collectors/simple_binary.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct BinaryCnnl final : public Kernel { + DataType dataType; + SimpleBinaryType opType; + std::vector aDims, bDims, cDims; + + BinaryCnnl(SimpleBinaryType, DataType, std::vector aDims_, std::vector bDims_, std::vector cDims_) noexcept; + + static KernelBox build(SimpleBinaryType, Tensor const &, Tensor const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + 
RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_BINARY_CNNL_HH diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc new file mode 100644 index 000000000..d35535948 --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc @@ -0,0 +1,91 @@ +#include "cnnl_activation_kernel.hh" +#include "kernel/collectors/simple_unary.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = ActivationCnnl; + using DT = DataType; + using Op = SimpleUnaryType; + + K::ActivationCnnl(Op type_, DT dataType_, int size_) noexcept + : Kernel(), type(type_), dataType(dataType_), size(size_) {} + + auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { + static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh}; + +#ifndef USE_BANG + return nullptr; +#endif + + return ARTHIMETIC.contains(op) + ? 
std::make_unique(op, a.dataType, static_cast(a.elementsSize())) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing activation using CNNL"; + } + +#ifdef USE_BANG + + auto ActivationCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = SimpleUnaryType; + + // RAII for closure + struct Descriptors { + cnnlActivationDescriptor_t activation; + cnnlTensorDescriptor_t tensor; + + Descriptors() : activation(nullptr), tensor(nullptr) { + CNNL_ASSERT(cnnlCreateActivationDescriptor(&activation)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyActivationDescriptor(activation)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + // clang-format off + auto mode = type == Ty::Relu ? CNNL_ACTIVATION_RELU + : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID + : type == Ty::Tanh ? 
CNNL_ACTIVATION_TANH + : UNREACHABLEX(cnnlActivationMode_t, ""); + // clang-format on + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v2(d->activation, mode, CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, 0.0)); + + res.fetchOrStore(); + return [d = std::move(d)]// + (Resources & res, void *, void const *const *inputs, void *const *outputs) { + float alpha = 1, beta = 0; + CNNL_ASSERT(cnnlActivationForward( + res.fetchOrStore()->handle, + d->activation, + &alpha, d->tensor, inputs[0], + &beta, d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh new file mode 100644 index 000000000..a5d7ad65c --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_ACTIVATION_CNNL_KERNEL_HH +#define KERNEL_ACTIVATION_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct ActivationCnnl final : public Kernel { + SimpleUnaryType type; + DataType dataType; + int size; + + ActivationCnnl(SimpleUnaryType, DataType, int) noexcept; + + static KernelBox build(SimpleUnaryType, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_ACTIVATION_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc new file mode 100644 index 000000000..f6e32159d --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -0,0 +1,94 @@ +#include "cnnl_simple_unary_kernel.hh" +#include 
"kernel/collectors/simple_unary.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = SimpleUnaryCnnl; + using DT = DataType; + using Op = SimpleUnaryType; + + K::SimpleUnaryCnnl(Op type_, DT dataType_, int size_) noexcept + : Kernel(), type(type_), dataType(dataType_), size(size_) {} + + auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { + static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg}; + +#ifndef USE_BANG + return nullptr; +#endif + + return supportedOp.contains(op) + ? std::make_unique(op, a.dataType, static_cast(a.elementsSize())) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing simple unary using CNNL"; + } + +#ifdef USE_BANG + + auto SimpleUnaryCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using Ty = SimpleUnaryType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t tensor; + + Descriptors() : tensor(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + + auto cnnlUnaryForward = [this](cnnlHandle_t handle, + const cnnlTensorDescriptor_t x_desc, + const void *x, + const cnnlTensorDescriptor_t y_desc, + void *y) -> cnnlStatus_t { + switch (this->type) { + case Ty::Abs: + return cnnlAbs(handle, x_desc, x, y_desc, y); + case Ty::Neg: + return cnnlNegTensor(handle, x_desc, x, y_desc, y); + case Ty::Sqrt: + 
return cnnlSqrt_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); + default: + UNREACHABLE(); + } + }; + + res.fetchOrStore(); + return [d = std::move(d), cnnlUnaryForward]// + (Resources & res, void *, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlUnaryForward( + res.fetchOrStore()->handle, + d->tensor, inputs[0], + d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh new file mode 100644 index 000000000..b69902f7b --- /dev/null +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH +#define KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct SimpleUnaryCnnl final : public Kernel { + SimpleUnaryType type; + DataType dataType; + int size; + + SimpleUnaryCnnl(SimpleUnaryType, DataType, int) noexcept; + + static KernelBox build(SimpleUnaryType, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SIMPLE_UNARY_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc new file mode 100644 index 000000000..865e452e1 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = SoftmaxCnnl; + + K::SoftmaxCnnl(cnnl::SoftmaxAlgo algo_, DataType type_, + int pre_, int mid_, int post_) noexcept + 
: Kernel(), algo(algo_), dataType(type_), + pre(pre_), mid(mid_), post(post_) {} + + auto K::build(cnnl::SoftmaxAlgo algo, SoftmaxInfo info) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(algo, info.type, info.pre, info.mid, info.post); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing softmax forward with CNNL"; + } + +#ifdef USE_BANG + + auto SoftmaxCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t t; + cnnlSoftmaxAlgorithm_t algo; + bool f32; + + Descriptors(decltype(algo) algo_, decltype(f32) f32_) + : algo(algo_), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&t)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(t)); + } + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + + auto d = std::make_shared( + static_cast(algo), + dataType != DataType::F64); + int dims[]{pre, mid, post}; + cnnlSoftmaxMode_t mode = (post == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION + : (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION + : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + + // cnnlSoftmaxForward_v2 is applied to a 3D input tensor only + CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims)); + + res.fetchOrStore(); + return [d = std::move(d), mode](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? 
factor(0) : factor(0); + CNNL_ASSERT(cnnlSoftmaxForward_v2( + res.fetchOrStore()->handle, + d->algo, + mode, + CNNL_COMPUTATION_ULTRAHIGH_PRECISION, + &a, d->t, inputs[0], + &b, d->t, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.hh b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh new file mode 100644 index 000000000..b9bedb5a4 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh @@ -0,0 +1,36 @@ +#ifndef KERNEL_SOFTMAX_CNNL_HH +#define KERNEL_SOFTMAX_CNNL_HH + +#include "kernel/attributes/softmax_info.h" +#include "kernel/collectors/softmax.h" + +namespace refactor::kernel { + + namespace cnnl { + enum class SoftmaxAlgo { + FAST = 0, + ACCURATE = 1, + LOG = 2, + }; + }// namespace cnnl + + struct SoftmaxCnnl final : public Kernel { + cnnl::SoftmaxAlgo algo; + DataType dataType; + int pre, mid, post; + + SoftmaxCnnl(cnnl::SoftmaxAlgo, DataType, int, int, int) noexcept; + + static KernelBox build(cnnl::SoftmaxAlgo, SoftmaxInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SOFTMAX_CNNL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc new file mode 100644 index 000000000..26a0bef1b --- /dev/null +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -0,0 +1,92 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = TransposeCnnl; + using Info = TransposeInfo; + + K::TransposeCnnl(DataType dataType_, Shape dimIn_, Shape dimOut_, Permutation perm_) noexcept + : Kernel(), dataType(dataType_), dimIn(std::move(dimIn_)), + 
dimOut(std::move(dimOut_)), perm(std::move(perm_)) {} + + auto K::build(DataType dataType, Shape shape_, Permutation perm_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + Shape dimOut_; + for (uint32_t i = 0; i < shape_.size(); i++) { + dimOut_.push_back(shape_[perm_[i]]); + } + return std::make_unique(dataType, std::move(shape_), std::move(dimOut_), std::move(perm_)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing transpose operation using CNNL"; + } + +#ifdef USE_BANG + auto TransposeCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t x, y; + cnnlTransposeDescriptor_t trans; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : x(nullptr), y(nullptr), trans(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&trans)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(trans)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + + auto d = std::make_shared(dataType != DT::F64); + setCnnlTensor(d->x, dataType, slice((int *)(dimIn.data()), dimIn.size())); + setCnnlTensor(d->y, dataType, slice((int *)(dimOut.data()), dimOut.size())); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, perm.size(), (int *)perm.data())); + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->x, d->trans, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d 
= std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + // name inputs and outputs + auto x = inputs[0]; + auto y = outputs[0]; + + CNNL_ASSERT(cnnlTranspose_v2(handle, d->trans, d->x, x, + d->y, y, workspace, workspaceSize)); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh new file mode 100644 index 000000000..37bb7e088 --- /dev/null +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_TRANSPOSE_CNNL_KERNEL_HH +#define KERNEL_TRANSPOSE_CNNL_KERNEL_HH + +#include "kernel/collectors/transpose.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + using Shape = absl::InlinedVector; + using Permutation = Shape; + + struct TransposeCnnl final : public Kernel { + DataType dataType; + Shape dimIn; + Shape dimOut; + Permutation perm; + + TransposeCnnl(DataType, Shape, Shape, Permutation) noexcept; + + static KernelBox build(DataType, Shape, Permutation) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_TRANSPOSE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/utilities/bang/cnnl_context.cc b/src/04kernel/src/utilities/bang/cnnl_context.cc new file mode 100644 index 000000000..15cc13829 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.cc @@ -0,0 +1,35 @@ +#ifdef USE_BANG + +#include "cnnl_context.hh" +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + CnnlContext::CnnlContext() : runtime::Resource() { + BANG_ASSERT(cnrtQueueCreate(&queue)); + 
CNNL_ASSERT(cnnlCreate(&handle)); + CNNL_ASSERT(cnnlSetQueue(handle, queue)); + } + CnnlContext::~CnnlContext() { + BANG_ASSERT(cnrtQueueDestroy(queue)); + CNNL_ASSERT(cnnlDestroy(handle)); + } + + auto CnnlContext::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto CnnlContext::build() -> runtime::ResourceBox { + return std::make_unique(); + } + + auto CnnlContext::resourceTypeId() const noexcept -> size_t { + return typeId(); + } + auto CnnlContext::description() const noexcept -> std::string_view { + return "CnnlContext"; + } + +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnnl_context.hh b/src/04kernel/src/utilities/bang/cnnl_context.hh new file mode 100644 index 000000000..7db40d3d3 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.hh @@ -0,0 +1,29 @@ +#ifndef KERNEL_CNNL_CONTEXT_HH +#define KERNEL_CNNL_CONTEXT_HH + +#include "runtime/resource.h" +#include +#include + +namespace refactor::kernel::cnnl { + + struct CnnlContext final : public runtime::Resource { + cnnlHandle_t handle; + cnrtQueue_t queue; + + CnnlContext(); + ~CnnlContext(); + CnnlContext(CnnlContext const &) noexcept = delete; + CnnlContext(CnnlContext &&) noexcept = delete; + + static size_t typeId() noexcept; + static runtime::ResourceBox build(); + + size_t resourceTypeId() const noexcept final; + std::string_view description() const noexcept final; + + }; + +}// namespace refactor::kernel::cnnl + +#endif// KERNEL_CNNL_CONTEXT_HH diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.cpp b/src/04kernel/src/utilities/bang/cnnl_functions.cpp new file mode 100644 index 000000000..8dfeb6457 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.cpp @@ -0,0 +1,38 @@ +#ifdef USE_BANG + +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType dataType) { + // clang-format off + switch (dataType) { + case DataType::F32 
: return CNNL_DTYPE_FLOAT; break; + case DataType::F64 : return CNNL_DTYPE_DOUBLE; break; + case DataType::FP16: return CNNL_DTYPE_HALF; break; + case DataType::I8 : return CNNL_DTYPE_INT8; break; + case DataType::I32 : return CNNL_DTYPE_INT32; break; + case DataType::U8 : return CNNL_DTYPE_UINT8; break; + case DataType::BF16: return CNNL_DTYPE_BFLOAT16; break; + case DataType::I64 : return CNNL_DTYPE_INT64; break; + case DataType::Bool: return CNNL_DTYPE_BOOL; break; + default: UNREACHABLE(); + } + // clang-format on + } + + void setCnnlTensor(cnnlTensorDescriptor_t t, DataType dt, slice_t d) { + auto dt_ = cnnlDataTypeConvert(dt); + if (auto n = d.size(); n == 4) { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } else if (n < 4) { + int d_[]{1, 1, 1, 1}; + std::copy_n(d.begin(), n, d_ + 4 - n); + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, 4, std::move(d_))); + } else { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } + } +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.h b/src/04kernel/src/utilities/bang/cnnl_functions.h new file mode 100644 index 000000000..4ba2f89d7 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.h @@ -0,0 +1,40 @@ +#ifndef KERNEL_CNNL_FUNCTIONS_H +#define KERNEL_CNNL_FUNCTIONS_H + +#include "common.h" +#include + +#define BANG_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \ + RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \ + cnrtGetErrorStr(status), (int) status)); \ + } + +#define CNNL_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNNL_STATUS_SUCCESS) { \ + fmt::println("cnnl failed on \"" #STATUS "\" with {}", \ + cnnlGetErrorString(status)); \ + abort(); \ + } + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType); + + // A helper function that set Cnnl tensor 
#ifdef USE_BANG

#include "../../../src/kernels/batch_normalization/cnnl_kernel.hh"
#include "../../../src/kernels/batch_normalization/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

// Cross-checks the CNNL batch-norm kernel against the CPU reference.
TEST(kernel, BatchNormalizationCnnl) {
    // build routines
    auto xTensor = Tensor::share(DataType::F32, Shape{1, 2, 3, 2});
    auto outTensor = Tensor::share(DataType::F32, Shape{1, 2, 3, 2});
    auto scaleTensor = Tensor::share(DataType::F32, Shape{2});
    auto biasTensor = Tensor::share(DataType::F32, Shape{2});
    auto meanTensor = Tensor::share(DataType::F32, Shape{2});
    auto varTensor = Tensor::share(DataType::F32, Shape{2});
    float epsilon = 0.00001;
    TensorRefs inputs = TensorRefs{*xTensor, *scaleTensor, *biasTensor, *meanTensor, *varTensor};
    auto kCpu = BatchNormalization::build(epsilon, inputs);
    auto kCnnl = BatchNormalizationCnnl::build(epsilon, inputs);
    ASSERT_TRUE(kCpu && kCnnl);
    auto res = runtime::Resources();
    auto rCpu = kCpu->lower(res).routine;
    auto [rMlu, workspaceSize] = kCnnl->lower(res);
    // device buffers
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         mluIn = dev.malloc(xTensor->bytesSize()),
         mluScale = dev.malloc(scaleTensor->bytesSize()),
         mluBias = dev.malloc(biasTensor->bytesSize()),
         mluMean = dev.malloc(meanTensor->bytesSize()),
         mluVar = dev.malloc(varTensor->bytesSize()),
         mluOut = dev.malloc(outTensor->bytesSize());
    // host-side inputs
    std::vector<float>
        data(xTensor->elementsSize(), 1.0f),
        scale(scaleTensor->elementsSize(), 0.5f),
        bias(biasTensor->elementsSize(), 1.0f),
        mean(meanTensor->elementsSize(), 0.5f),
        var(varTensor->elementsSize(), 1.0f),
        cpuOut(outTensor->elementsSize());
    mluIn->copyFromHost(data.data(), xTensor->bytesSize());
    mluScale->copyFromHost(scale.data(), scaleTensor->bytesSize());
    mluBias->copyFromHost(bias.data(), biasTensor->bytesSize());
    mluMean->copyFromHost(mean.data(), meanTensor->bytesSize());
    mluVar->copyFromHost(var.data(), varTensor->bytesSize());
    // run the CPU reference, then the MLU kernel
    {
        void const *inputs[]{data.data(), scale.data(), bias.data(), mean.data(), var.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{*mluIn, *mluScale, *mluBias, *mluMean, *mluVar};
        void *outputs[]{*mluOut};
        rMlu(res, *workspace, inputs, outputs);
    }
    // compare element-wise
    std::vector<float> result(outTensor->elementsSize());
    mluOut->copyToHost(result.data(), outTensor->bytesSize());
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}

#endif
Tensor::share(DataType::F32, xShape); + auto yTensor = Tensor::share(DataType::F32, yShape); + //bool ceil = false; + bool ceil = true; + int64_t const dilations[] = {1, 1}; + PoolAttributes poolAttributes(rank, dilations, pads, strides); + + auto kernel = PoolCnnl::build(poolType, ceil, kernelShape, poolAttributes, *dataTensor, *yTensor); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i * 0.1f; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, *workspace, inputs, outputs); + // take output data + std::vector result(yTensor->elementsSize()); + mluMem->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(ExpectData.size())) { + EXPECT_FLOAT_EQ(ExpectData[i], result[i]); + } +} + +TEST(kernel, PoolCnnlMax) { + int rank = 2; + int64_t const + pads[]{0, 0, 0, 0}, + strides[]{2, 2}; + KernelShape kernelShape{2, 2}; + Shape + xShape{1, 1, 4, 4}, + yShape{1, 1, 2, 2}; + const std::vector ExpectData = {0.5, 0.7, 1.3, 1.5}; + testPoolCnnl(PoolType::Max, rank, pads, strides, kernelShape, xShape, yShape, ExpectData); +} + +TEST(kernel, PoolCnnlAvg) { + int rank = 2; + int64_t const + pads[]{0, 0, 0, 0}, + strides[]{2, 2}; + KernelShape kernelShape{2, 2}; + Shape + xShape{1, 1, 4, 4}, + yShape{1, 1, 2, 2}; + const std::vector ExpectData = {0.25, 0.45, 1.05, 1.25}; + testPoolCnnl(PoolType::Average, rank, pads, strides, kernelShape, xShape, yShape, ExpectData); +} + +#endif diff --git a/src/04kernel/test/kernels/reduce/test_cnnl.cpp b/src/04kernel/test/kernels/reduce/test_cnnl.cpp new file mode 100644 
index 000000000..32952fead --- /dev/null +++ b/src/04kernel/test/kernels/reduce/test_cnnl.cpp @@ -0,0 +1,64 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/reduce/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testReducemean(const Shape &shape, const std::vector &data, + Axes axes, const std::vector ExpectData) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, shape); + auto kernel = ReduceCnnl::build(axes, ReduceType::Mean, {*dataTensor}); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + mluMemIn = dev.malloc(dataTensor->bytesSize()), + mluMemOut = dev.malloc(dataTensor->bytesSize()); + // put input output data + mluMemIn->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMemIn}; + void *outputs[]{*mluMemOut}; + routine(res, *workspace, inputs, outputs); + } + // take output data + Shape outDimArray; + std::unordered_set axesSet(axes.begin(), axes.end()); + for (size_t i = 0; i < shape.size(); ++i) { + if (axesSet.contains(i)) { + outDimArray.push_back(shape[i]); + } + } + auto outputTensor = Tensor::share(DataType::F32, outDimArray); + std::vector result(outDimArray.size()); + mluMemOut->copyToHost(result.data(), outputTensor->bytesSize()); + // check + for (auto i : range0_(ExpectData.size())) { + EXPECT_FLOAT_EQ(ExpectData[i], result[i]); + } +} + +TEST(kernel, ReduceMeanCnnl) { + testReducemean({2, 3, 2, 2}, + {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 2}, + {5, 6, 17, 18}); + testReducemean({2, 3, 2, 2, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 2}, + {5, 6, 17, 18}); +} + +#endif 
#ifdef USE_BANG

// FIX: use the same relative include depth as the sibling kernel tests —
// this file lives in test/kernels/simple_binary/, three levels below src's
// sibling, so "../src/..." did not resolve.
#include "../../../src/kernels/simple_binary/binary_cnnl.hh"
#include "../../../src/kernels/simple_binary/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

// Cross-checks one binary op on the MLU against the CPU reference kernel.
void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape dimC) {
    // Create Tensor and build kernels
    auto aTensor = Tensor::share(DataType::F32, dimA, LayoutType::NCHW);
    auto bTensor = Tensor::share(DataType::F32, dimB, LayoutType::NCHW);
    auto cTensor = Tensor::share(DataType::F32, dimC, LayoutType::NCHW);
    auto kernel = BinaryCnnl::build(binaryOPT, *aTensor, *bTensor, *cTensor);
    auto kCpu = BinaryCpu::build(binaryOPT, *aTensor, *bTensor);
    ASSERT_TRUE(kCpu && kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    auto rCpu = kCpu->lower(res).routine;
    // Init inputs and outputs
    std::vector<float>
        a(aTensor->elementsSize(), 3.0f),
        b(bTensor->elementsSize(), 2.0f),
        c(cTensor->elementsSize());
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         aMLU = dev.malloc(aTensor->bytesSize()),
         bMLU = dev.malloc(bTensor->bytesSize()),
         cMLU = dev.malloc(cTensor->bytesSize());
    aMLU->copyFromHost(a.data(), aTensor->bytesSize());
    bMLU->copyFromHost(b.data(), bTensor->bytesSize());
    // Compute
    {
        void const *inputs[]{*aMLU, *bMLU};
        void *outputs[]{*cMLU};
        routine(res, *workspace, inputs, outputs);
    }
    {
        void const *inputs[]{a.data(), b.data()};
        void *outputs[]{c.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // Compare
    std::vector<float> result(cTensor->elementsSize());
    cMLU->copyToHost(result.data(), cTensor->bytesSize());
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(c[i], result[i]);
    }
}

TEST(kernel, BinaryCnnlAdd) {
    testBinaryCnnl(SimpleBinaryType::Add, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlMul) {
    testBinaryCnnl(SimpleBinaryType::Mul, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlSub) {
    testBinaryCnnl(SimpleBinaryType::Sub, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlDiv) {
    testBinaryCnnl(SimpleBinaryType::Div, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

// Logic ops need a boolean/int dtype on the CPU reference; left disabled.
// TEST(kernel, BinaryCnnlAnd) {
//     testBinaryCnnl(SimpleBinaryType::And, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

// TEST(kernel, BinaryCnnlOr) {
//     testBinaryCnnl(SimpleBinaryType::Or, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

// TEST(kernel, BinaryCnnlXor) {
//     testBinaryCnnl(SimpleBinaryType::Xor, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
// }

TEST(kernel, BinaryCnnlPow) {
    testBinaryCnnl(SimpleBinaryType::Pow, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40});
}

TEST(kernel, BinaryCnnlBroadcast) {
    testBinaryCnnl(SimpleBinaryType::Add, Shape{3, 4, 5, 6}, Shape{}, Shape{3, 4, 5, 6});
}

#endif
+#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testOp(SimpleUnaryType opType, bool activation = true) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{20, 30, 50}); + auto kernel = activation ? ActivationCnnl::build(opType, *dataTensor) + : SimpleUnaryCnnl::build(opType, *dataTensor); + auto kCpu = SimpleUnaryCpu::build(opType, *dataTensor); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i * 1e-4f; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{data.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluMem->copyToHost(result.data(), dataTensor->bytesSize()); + // check + for (auto i : range0_(data.size())) { + EXPECT_NEAR(data[i], result[i], 1e-4); + } +} + +TEST(kernel, SimpleUnaryCnnl) { + testOp(SimpleUnaryType::Abs, false); + testOp(SimpleUnaryType::Neg, false); + testOp(SimpleUnaryType::Sqrt, false); +} + +TEST(kernel, ActivationCnnl) { + testOp(SimpleUnaryType::Relu); + testOp(SimpleUnaryType::Sigmoid); + testOp(SimpleUnaryType::Tanh); +} + + +#endif// USE_BANG diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp new file mode 100644 index 000000000..a8c7fb283 --- /dev/null +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -0,0 +1,52 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/softmax/cpu_kernel.hh" 
+#include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SoftmaxCnnl) { + // build routine + auto xTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + auto outTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + dim_t axis = 2; + auto kCpu = SoftmaxCpu::build(SoftmaxInfo(*xTensor, axis)); + auto kCnnl = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::FAST, SoftmaxInfo(*xTensor, axis)); + ASSERT_TRUE(kCpu && kCnnl); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto rCnnl = kCnnl->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(xTensor->bytesSize()), + mluOut = dev.malloc(outTensor->bytesSize()); + // put input data + std::vector + data(xTensor->elementsSize(), 0), + cpuOut(outTensor->elementsSize()); + mluIn->copyFromHost(data.data(), xTensor->bytesSize()); + // inference + { + void const *inputs[]{data.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + rCnnl(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(outTensor->elementsSize()); + mluOut->copyToHost(result.data(), outTensor->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp new file mode 100644 index 000000000..4f4301d86 --- /dev/null +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -0,0 +1,55 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/transpose/cnnl_kernel.hh" +#include "../../../src/kernels/transpose/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; 
+using namespace hardware; + +TEST(kernel, TransposeCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{1, 3, 2, 5}); + auto info = TransposeInfo(dataTensor->shape, Permutation{2, 3, 0, 1}); + auto kCpu = TransposeCpu::build(dataTensor->dataType, info); + auto kernel = TransposeCnnl::build(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1}); + ASSERT_TRUE(kCpu && kernel); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto bytes = dataTensor->bytesSize(); + auto workspace = dev.malloc(workspaceSize), + mluIn = dev.malloc(bytes), + mluOut = dev.malloc(bytes); + // put input data + std::vector + cpuIn(dataTensor->elementsSize()), + cpuOut(cpuIn.size()); + std::iota(cpuIn.begin(), cpuIn.end(), 0); + mluIn->copyFromHost(cpuIn.data(), bytes); + // inference + { + void const *inputs[]{cpuIn.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluOut->copyToHost(result.data(), bytes); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif From e329552d84db11a4626fe8ab08777f47e8d0f5bc Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Thu, 11 Jan 2024 03:41:10 +0000 Subject: [PATCH 2/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=92?= =?UTF-8?q?=E6=AD=A6=E7=BA=AA=E5=B9=B3=E5=8F=B0where/expand/conv=E7=AE=97?= =?UTF-8?q?=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/conv.cc | 6 + src/04kernel/src/collectors/where.cc | 8 +- .../batch_normalization/cnnl_kernel.cc | 6 +- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 243 
++++++++++++++++++ src/04kernel/src/kernels/conv/cnnl_kernel.hh | 43 ++++ .../src/kernels/expand/cnnl_kernel.cc | 68 +++++ .../src/kernels/expand/cnnl_kernel.hh | 31 +++ .../src/kernels/transpose/cnnl_kernel.cc | 36 ++- .../src/kernels/transpose/cnnl_kernel.hh | 15 +- src/04kernel/src/kernels/where/cnnl_kernel.cc | 116 +++++++++ src/04kernel/src/kernels/where/cnnl_kernel.hh | 32 +++ src/04kernel/test/kernels/conv/test_cnnl.cpp | 76 ++++++ .../test/kernels/expand/test_cnnl.cpp | 50 ++++ src/04kernel/test/kernels/where/test_cnnl.cpp | 68 +++++ 14 files changed, 775 insertions(+), 23 deletions(-) create mode 100644 src/04kernel/src/kernels/conv/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/conv/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/expand/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/expand/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/where/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/where/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/conv/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/expand/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/where/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/conv.cc b/src/04kernel/src/collectors/conv.cc index 14b61835f..d9cc0ea27 100644 --- a/src/04kernel/src/collectors/conv.cc +++ b/src/04kernel/src/collectors/conv.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/conv.h" +#include "../kernels/conv/cnnl_kernel.hh" #include "../kernels/conv/cudnn_kernel.hh" namespace refactor::kernel { @@ -23,6 +24,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ConvCnnl::build(poolAttrs, x, w, b, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/where.cc b/src/04kernel/src/collectors/where.cc index 1ce2b60c6..14aa21952 100644 --- 
a/src/04kernel/src/collectors/where.cc +++ b/src/04kernel/src/collectors/where.cc @@ -1,11 +1,12 @@ #include "kernel/collectors/where.h" +#include "../kernels/where/cnnl_kernel.hh" #include "../kernels/where/cpu_kernel.hh" #include "../kernels/where/where_cuda.hh" namespace refactor::kernel { std::vector - WhereCollector::filter(TensorRefs inputs, TensorRefs) const { + WhereCollector::filter(TensorRefs inputs, TensorRefs outputs) const { std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -18,6 +19,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = WhereCnnl::build(inputs, outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc index be06233cd..e3efcd1d3 100644 --- a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -107,10 +107,10 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); auto handle = res.fetchOrStore()->handle; - auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX); + auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * info.dtX.size(); size_t workspaceSize; CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize)); - size_t totalWorkspaceSize = xTransSize + workspaceSize; + size_t totalWorkspaceSize = xTransSize * 2 + workspaceSize; res.fetchOrStore(); auto routine = [d = std::move(d), @@ -129,7 +129,7 @@ namespace refactor::kernel { void *xTrans = workspace; void *yTrans = xTrans + xTransSize; - void *cursor = yTrans + workspaceSize; + void *cursor = yTrans + xTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, diff --git 
a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc new file mode 100644 index 000000000..85dcb9517 --- /dev/null +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ -0,0 +1,243 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include "../expand/cnnl_kernel.hh" +#include "hardware/functions.h" +#endif + +namespace refactor::kernel { + using K = ConvCnnl; + + K::ConvCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(PoolAttributes const &poolAttributes, + Tensor const &x, + Tensor const &w, + std::optional> b, + Tensor const &y) -> KernelBox { + static const std::unordered_set + SET{DataType::FP16, DataType::BF16, DataType::F32, DataType::F64, DataType::I8}; +#ifndef USE_BANG + return nullptr; +#endif + + auto dt = x.dataType; + if (!SET.contains(dt) || w.dataType != dt || y.dataType != dt) { + return nullptr; + } + + std::optional biasExpand = std::nullopt; + if (b) { + ASSERT(b->get().shape[0] == y.shape[1], ""); + std::vector input(y.rank(), 1); + input[1] = y.shape[1]; + biasExpand.emplace(ExpandInfoCnnl( + b->get().dataType, + slice(input.data(), input.size()), + slice(y.shape.data(), y.rank()))); + } + + // group is not supported + if (w.rank() != 4 || poolAttributes.rank() != 2) { + return nullptr; + } + auto d = poolAttributes.dilations(), + p = poolAttributes.pads(), + s = poolAttributes.strides(); + return std::make_unique(decltype(info){ + dt, + { + static_cast(x.shape[0]), + static_cast(x.shape[1]), + static_cast(x.shape[2]), + static_cast(x.shape[3]), + }, + { + static_cast(w.shape[0]), + static_cast(w.shape[1]), + static_cast(w.shape[2]), + static_cast(w.shape[3]), + }, + { + static_cast(y.shape[0]), + static_cast(y.shape[1]), + static_cast(y.shape[2]), + static_cast(y.shape[3]), + }, + {d[0], d[1]}, + {p[0], p[1], p[2], p[3]}, + {s[0], s[1]}, + std::move(biasExpand), 
+ }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing conv using CNNL"; + } + +#ifdef USE_BANG + + auto ConvCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t x, y, w; + cnnlTensorDescriptor_t xTrans, yTrans, wTrans; + cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; + cnnlConvolutionDescriptor_t conv; + cnnlConvolutionForwardAlgo_t algo; + // std::optional extraPadding; + std::optional biasExpand; + bool f32; + + Descriptors(decltype(f32) f32_) + :// extraPadding(std::nullopt), + biasExpand(std::nullopt), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&w)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&xTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&yTrans)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&wTrans)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC)); + CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW)); + CNNL_ASSERT(cnnlCreateConvolutionDescriptor(&conv)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(w)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(xTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(yTrans)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(wTrans)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC)); + CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW)); + CNNL_ASSERT(cnnlDestroyConvolutionDescriptor(conv)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dt != DataType::F64); + // 
d->extraPadding = ExtraPadding::build(info.dt, info.xShape, info.pad); + if (info.biasExpand) { + d->biasExpand = ExpandCnnl(*info.biasExpand).lower(res).routine; + } + int xs[]{ + info.xShape[0], + info.xShape[1], + info.xShape[2] + std::abs(info.pad[0] - info.pad[2]), + info.xShape[3] + std::abs(info.pad[1] - info.pad[3]), + }; + + auto NHWC = [](const int shape[]) -> std::vector { + return { + shape[0], shape[2], shape[3], shape[1]}; + }; + + std::vector xsNHWC = NHWC(xs); + std::vector wsNHWC = NHWC(info.wShape); + std::vector ysNHWC = NHWC(info.yShape); + + setCnnlTensor(d->x, info.dt, slice(xs, 4)); + setCnnlTensor(d->y, info.dt, slice(info.yShape, 4)); + setCnnlTensor(d->w, info.dt, slice(info.wShape, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->xTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, xsNHWC.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->yTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, ysNHWC.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->wTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, wsNHWC.data())); + + auto xTransSize = cnnlGetTensorElementNum(d->xTrans) * info.dt.size(); + auto yTransSize = cnnlGetTensorElementNum(d->yTrans) * info.dt.size(); + auto wTransSize = cnnlGetTensorElementNum(d->wTrans) * info.dt.size(); + + int permuteIn[4] = {0, 2, 3, 1}; + int permuteOut[4] = {0, 3, 1, 2}; + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permuteIn)); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut)); + + size_t xWorkspaceSize, yWorkspaceSize, wWorkspaceSize, convWorkspaceSize; + auto handle = res.fetchOrStore()->handle; + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->x, d->NCHW2NHWC, &xWorkspaceSize)); + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->w, d->NCHW2NHWC, &wWorkspaceSize)); + CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->yTrans, d->NHWC2NCHW, &yWorkspaceSize)); + + // clang-format off + auto computation = info.dt == DataType::F64 ? 
DataType::F64 + : info.dt == DataType::I8 ? DataType::I32 + : DataType::F32; + // clang-format on + auto group = xs[1] / info.wShape[1]; + CNNL_ASSERT(cnnlSetConvolutionDescriptor(d->conv, 4, info.pad, info.stride, info.dilation, group, cnnlDataTypeConvert(computation))); + CNNL_ASSERT(cnnlGetConvolutionForwardAlgorithm( + handle, d->conv, d->xTrans, d->wTrans, d->yTrans, + CNNL_CONVOLUTION_FWD_FASTEST, &d->algo)); + + CNNL_ASSERT(cnnlGetConvolutionForwardWorkspaceSize( + handle, d->xTrans, d->wTrans, d->yTrans, NULL, + d->conv, d->algo, &convWorkspaceSize)); + + // if (d->extraPadding) { + // workspaceSize = hardware::alignBytes(workspaceSize, 256); + // } + + size_t workspaceSize = xTransSize + yTransSize + wTransSize + std::max({xWorkspaceSize, wWorkspaceSize, yWorkspaceSize, convWorkspaceSize}); + + res.fetchOrStore(); + auto routine = [d, xTransSize, yTransSize, wTransSize, + xWorkspaceSize, wWorkspaceSize, + yWorkspaceSize, convWorkspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto handle = res.fetchOrStore()->handle; + void const *x = inputs[0], *w = inputs[1]; + void *y = outputs[0]; + // if (auto f = d->extraPadding; f) { + // x = (*f)(x, reinterpret_cast(workspace) + workspaceSize); + // } + // if (auto f = d->biasExpand; f) { + // (*f)(res, workspace, inputs + 2, outputs); + // } + + void *xTrans = workspace; + void *wTrans = xTrans + xTransSize; + void *yTrans = wTrans + wTransSize; + void *opWorkspace = yTrans + yTransSize; + + // transpose NCHW input to NHWC + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x, + d->xTrans, xTrans, opWorkspace, xWorkspaceSize)); + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->w, w, + d->wTrans, wTrans, opWorkspace, wWorkspaceSize)); + + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 + ? factor(d->biasExpand ? 1 : 0) + : factor(d->biasExpand ? 
1 : 0); + CNNL_ASSERT(cnnlConvolutionForward( + handle, + d->conv, d->algo, &a, + d->xTrans, xTrans, d->wTrans, wTrans, + NULL, NULL, opWorkspace, convWorkspaceSize, + &b, d->yTrans, yTrans)); + + // transpose NHWC intermediates to NCHW + CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->yTrans, yTrans, + d->y, y, opWorkspace, yWorkspaceSize)); + }; + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.hh b/src/04kernel/src/kernels/conv/cnnl_kernel.hh new file mode 100644 index 000000000..814e75ad5 --- /dev/null +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.hh @@ -0,0 +1,43 @@ +#ifndef KERNEL_CONV_CNNL_KERNEL_HH +#define KERNEL_CONV_CNNL_KERNEL_HH + +#include "../../kernels/expand/cnnl_kernel.hh" +#include "kernel/attributes/pool_attributes.h" +#include "kernel/kernel.h" +#include + +namespace refactor::kernel { + + /// @brief Use `cnnlConvolutionForward`. + /// It only supports 4D tensors. 
+ struct ConvCnnl final : public Kernel { + struct { + DataType dt; + int xShape[4], + wShape[4], + yShape[4], + dilation[2], + pad[4], + stride[2]; + std::optional biasExpand; + } info; + + explicit ConvCnnl(decltype(info)) noexcept; + + static KernelBox build(PoolAttributes const &, + Tensor const &, + Tensor const &, + std::optional>, + Tensor const &); + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CONV_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.cc b/src/04kernel/src/kernels/expand/cnnl_kernel.cc new file mode 100644 index 000000000..18f586422 --- /dev/null +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.cc @@ -0,0 +1,68 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ExpandCnnl; + + K::ExpandCnnl(ExpandInfoCnnl info_) noexcept + : Kernel(), info(info_) {} + + auto K::build(Tensor const &input, Tensor const &output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(ExpandInfoCnnl( + input.dataType, + slice(input.shape.data(), input.rank()), + slice(output.shape.data(), output.rank()) + )); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing expand operation using CNNL"; + } + +#ifdef USE_BANG + auto ExpandCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + + Descriptors() : 
inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + std::vector in(info.inDims.begin(), info.inDims.end()), + out(info.outDims.begin(), info.outDims.end()); + setCnnlTensor(d->inDesc, info.dataType, slice(in.data(), in.size())); + setCnnlTensor(d->outDesc, info.dataType, slice(out.data(), out.size())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlExpand(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->outDesc, outputs[0])); + // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.hh b/src/04kernel/src/kernels/expand/cnnl_kernel.hh new file mode 100644 index 000000000..a6271907e --- /dev/null +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.hh @@ -0,0 +1,31 @@ +#ifndef KERNEL_EXPAND_CNNL_KERNEL_HH +#define KERNEL_EXPAND_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ExpandInfoCnnl { + DataType dataType; + slice_t inDims, outDims; + }; + + struct ExpandCnnl final : public Kernel { + ExpandInfoCnnl info; + + explicit ExpandCnnl(ExpandInfoCnnl) noexcept; + + static KernelBox build(Tensor const &input, Tensor const &output) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_EXPAND_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc 
b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc index 26a0bef1b..a5a6881e4 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -8,21 +8,29 @@ namespace refactor::kernel { using K = TransposeCnnl; - using Info = TransposeInfo; + using Info = TransposeInfoCnnl; - K::TransposeCnnl(DataType dataType_, Shape dimIn_, Shape dimOut_, Permutation perm_) noexcept - : Kernel(), dataType(dataType_), dimIn(std::move(dimIn_)), - dimOut(std::move(dimOut_)), perm(std::move(perm_)) {} + Info::TransposeInfoCnnl(DataType dataType_, std::vector input_, std::vector perm_) + : dataType(dataType_), inDim(input_), perm(perm_) { + ASSERT(input_.size() == perm_.size(), "Unreachable"); + for (uint32_t i = 0; i < input_.size(); i++) { + outDim.push_back(input_[perm_[i]]); + } + } + + Info::TransposeInfoCnnl(DataType dataType, Shape shape, Permutation perm) + : TransposeInfoCnnl(dataType, + std::move(std::vector(shape.begin(), shape.end())), + std::move(std::vector(perm.begin(), perm.end()))) { } - auto K::build(DataType dataType, Shape shape_, Permutation perm_) noexcept -> KernelBox { + K::TransposeCnnl(Info info_) noexcept + : Kernel(), info(std::move(info_)) { } + + auto K::build(DataType dataType, Shape shape, Permutation perm) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif - Shape dimOut_; - for (uint32_t i = 0; i < shape_.size(); i++) { - dimOut_.push_back(shape_[perm_[i]]); - } - return std::make_unique(dataType, std::move(shape_), std::move(dimOut_), std::move(perm_)); + return std::make_unique(TransposeInfoCnnl(dataType, shape, perm)); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1; @@ -63,10 +71,10 @@ namespace refactor::kernel { Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(dataType != DT::F64); - setCnnlTensor(d->x, dataType, slice((int *)(dimIn.data()), dimIn.size())); - setCnnlTensor(d->y, dataType, slice((int *)(dimOut.data()), dimOut.size())); - 
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, perm.size(), (int *)perm.data())); + auto d = std::make_shared(info.dataType != DT::F64); + setCnnlTensor(d->x, info.dataType, slice(info.inDim.data(), info.inDim.size())); + setCnnlTensor(d->y, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, info.perm.size(), info.perm.data())); auto handle = res.fetchOrStore()->handle; size_t workspaceSize; diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh index 37bb7e088..62254dc4e 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.hh @@ -9,13 +9,18 @@ namespace refactor::kernel { using Shape = absl::InlinedVector; using Permutation = Shape; - struct TransposeCnnl final : public Kernel { + struct TransposeInfoCnnl { DataType dataType; - Shape dimIn; - Shape dimOut; - Permutation perm; + std::vector inDim, outDim, perm; + + TransposeInfoCnnl(DataType, std::vector, std::vector); + TransposeInfoCnnl(DataType, Shape, Permutation); + }; + + struct TransposeCnnl final : public Kernel { + TransposeInfoCnnl info; - TransposeCnnl(DataType, Shape, Shape, Permutation) noexcept; + TransposeCnnl(TransposeInfoCnnl) noexcept; static KernelBox build(DataType, Shape, Permutation) noexcept; static size_t typeId() noexcept; diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc new file mode 100644 index 000000000..774c5513d --- /dev/null +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -0,0 +1,116 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = WhereCnnl; + + K::WhereCnnl(decltype(info) info_) noexcept + : Kernel(), info(info_) {} + + auto K::build(TensorRefs const &inputs, TensorRefs const 
&outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info) { + inputs[1].get().dataType, + inputs[0].get().shape, + inputs[1].get().shape, + inputs[2].get().shape, + outputs[0].get().shape, + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing where operation using CNNL"; + } + +#ifdef USE_BANG + auto WhereCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t cond, x, y, ans; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : cond(nullptr), x(nullptr), y(nullptr), + ans(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&ans)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(cond)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(ans)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dataType != DT::F64); + + std::vector cDim(info.condDim.begin(), info.condDim.end()), + xDim(info.thenDim.begin(), info.thenDim.end()), + yDim(info.elseDim.begin(), info.elseDim.end()), + ansDim(info.outputDim.begin(), info.outputDim.end()); + + auto rightAlign = [](std::vector &dim, uint32_t targetLength) { + if (dim.size() < targetLength) { + dim.insert(dim.begin(), targetLength - dim.size(), 1); + } + }; + if (ansDim.size() == 0) { + ansDim.push_back(1); + } + rightAlign(cDim, ansDim.size()); + rightAlign(xDim, 
ansDim.size()); + rightAlign(yDim, ansDim.size()); + + CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data())); + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSelectV2WorkspaceSize(handle, d->cond, d->x, d->y, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + auto cond = inputs[0], + x = inputs[1], + y = inputs[2]; + auto ans = outputs[0]; + + CNNL_ASSERT(cnnlSelectV2( + handle, d->cond, cond, d->x, x, + d->y, y, workspace, workspaceSize, + d->ans, ans)); + + cnrtQueueSync(res.fetchOrStore()->queue); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.hh b/src/04kernel/src/kernels/where/cnnl_kernel.hh new file mode 100644 index 000000000..6139b1832 --- /dev/null +++ b/src/04kernel/src/kernels/where/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_WHERE_CNNL_HH +#define KERNEL_WHERE_CNNL_HH + +#include "kernel/collectors/where.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + using Shape = absl::InlinedVector; + + struct WhereCnnl final : public Kernel { + struct { + DataType dataType; + Shape condDim, thenDim, elseDim, outputDim; + } info; + + WhereCnnl(decltype(info)) noexcept; + + static KernelBox build(TensorRefs const &, 
TensorRefs const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_WHERE_CNNL_HH diff --git a/src/04kernel/test/kernels/conv/test_cnnl.cpp b/src/04kernel/test/kernels/conv/test_cnnl.cpp new file mode 100644 index 000000000..bf4bff3c8 --- /dev/null +++ b/src/04kernel/test/kernels/conv/test_cnnl.cpp @@ -0,0 +1,76 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/conv/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const int64_t *dilations, + Shape xShape, Shape wShape, Shape yShape, + const std::vector &ExpectData) { + auto xTensor = Tensor::share(DataType::F32, xShape); + auto wTensor = Tensor::share(DataType::F32, wShape); + auto yTensor = Tensor::share(DataType::F32, yShape); + PoolAttributes poolAttributes(rank, dilations, pads, strides); + auto kernel = ConvCnnl::build(poolAttributes, *xTensor, *wTensor, std::nullopt, *yTensor); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // bang malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + xMlu = dev.malloc(xTensor->bytesSize()), + wMlu = dev.malloc(wTensor->bytesSize()), + yMlu = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector xIncremental(xTensor->elementsSize()), + wIncremental(wTensor->elementsSize()); + std::iota(xIncremental.begin(), xIncremental.end(), 0); + std::iota(wIncremental.begin(), wIncremental.end(), 0); + std::vector xData(xIncremental.begin(), xIncremental.end()), + wData(wIncremental.begin(), wIncremental.end()); + 
#ifdef USE_BANG

#include "../../../src/kernels/conv/cnnl_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

/// Runs ConvCnnl on incremental input/filter data and checks against ExpectData.
void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const int64_t *dilations,
                  Shape xShape, Shape wShape, Shape yShape,
                  const std::vector<float> &ExpectData) {
    auto xTensor = Tensor::share(DataType::F32, xShape);
    auto wTensor = Tensor::share(DataType::F32, wShape);
    auto yTensor = Tensor::share(DataType::F32, yShape);
    PoolAttributes poolAttributes(rank, dilations, pads, strides);
    auto kernel = ConvCnnl::build(poolAttributes, *xTensor, *wTensor, std::nullopt, *yTensor);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    // bang malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         xMlu = dev.malloc(xTensor->bytesSize()),
         wMlu = dev.malloc(wTensor->bytesSize()),
         yMlu = dev.malloc(yTensor->bytesSize());
    // put input data: 0, 1, 2, ... in both x and w
    std::vector<int> xIncremental(xTensor->elementsSize()),
        wIncremental(wTensor->elementsSize());
    std::iota(xIncremental.begin(), xIncremental.end(), 0);
    std::iota(wIncremental.begin(), wIncremental.end(), 0);
    std::vector<float> xData(xIncremental.begin(), xIncremental.end()),
        wData(wIncremental.begin(), wIncremental.end());
    xMlu->copyFromHost(xData.data(), xTensor->bytesSize());
    wMlu->copyFromHost(wData.data(), wTensor->bytesSize());
    // inference
    void const *inputs[]{*xMlu, *wMlu};
    void *outputs[]{*yMlu};
    routine(res, *workspace, inputs, outputs);
    // take output data (dead read-back of the inputs and commented-out
    // debugging dumps removed)
    std::vector<float> result(yTensor->elementsSize());
    yMlu->copyToHost(result.data(), yTensor->bytesSize());
    // check
    for (auto i : range0_(ExpectData.size())) {
        EXPECT_FLOAT_EQ(ExpectData[i], result[i]);
    }
}

TEST(kernel, ConvCnnl) {
    int rank = 2;
    int64_t const
        pads[]{1, 1, 1, 1},
        strides[]{1, 1},
        dilations[]{1, 1};
    Shape
        xShape{1, 3, 3, 2},
        wShape{1, 3, 3, 2},
        yShape{1, 1, 3, 3};
    const std::vector<float> ExpectData = {570, 1158, 582, 888, 1785, 888, 582, 1158, 570};
    testConvCnnl(rank, pads, strides, dilations, xShape, wShape, yShape, ExpectData);
}


#endif
#ifdef USE_BANG

#include "../../../src/kernels/expand/cnnl_kernel.hh"
#include "../../../src/kernels/expand/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, ExpandCnnl) {
    // build routine: broadcast (3,4,1,6) to (2,3,4,5,6), checked against the
    // CPU reference kernel
    auto input = Tensor::share(DataType::F32, Shape{3, 4, 1, 6}),
         output = Tensor::share(DataType::F32, Shape{2, 3, 4, 5, 6});
    auto kernel = ExpandCnnl::build(*input, *output);
    auto kCpu = ExpandCpu::build(ExpandInfo(*input, *output));
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;
    auto rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto mluIn = dev.malloc(input->bytesSize()),
         mluOut = dev.malloc(output->bytesSize());
    // put input data
    std::vector<float>
        data(input->elementsSize()),
        ans(output->elementsSize()),
        result(ans.size());
    std::iota(data.begin(), data.end(), 0);
    mluIn->copyFromHost(data.data(), input->bytesSize());
    // inference
    {
        void const *inputs[]{*mluIn};
        void *outputs[]{*mluOut};
        routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{data.data()};
        void *outputs[]{ans.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // check
    mluOut->copyToHost(result.data(), output->bytesSize());
    EXPECT_EQ(result, ans);
}

#endif
#ifdef USE_BANG

#include "../../../src/kernels/where/cnnl_kernel.hh"
#include "../../../src/kernels/where/cpu_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

/// Compares WhereCnnl against the CPU reference on broadcastable shapes.
void testWhereCnnl(Shape cDim, Shape xDim, Shape yDim, Shape outDim) {
    // build routine
    auto cTensor = Tensor::share(DataType::Bool, cDim);
    auto xTensor = Tensor::share(DataType::F32, xDim);
    auto yTensor = Tensor::share(DataType::F32, yDim);
    auto outTensor = Tensor::share(DataType::F32, outDim);
    auto kCpu = WhereCpu::build({*cTensor, *xTensor, *yTensor});
    auto kCnnl = WhereCnnl::build({*cTensor, *xTensor, *yTensor}, {*outTensor});
    ASSERT_TRUE(kCpu && kCnnl);
    auto res = runtime::Resources();
    auto rCpu = kCpu->lower(res).routine;
    auto [rCnnl, workspaceSize] = kCnnl->lower(res);
    // malloc
    auto &dev = *device::init(Device::Type::Mlu, 0, "");
    auto workspace = dev.malloc(workspaceSize),
         mluC = dev.malloc(cTensor->bytesSize()),
         mluX = dev.malloc(xTensor->bytesSize()),
         mluY = dev.malloc(yTensor->bytesSize()),
         mluOut = dev.malloc(outTensor->bytesSize());
    // put input data: one byte per Bool element, all true.
    // (The original used a variable-length `int` array filled by memset,
    // which is non-standard C++ and mismatches the tensor's byte layout.)
    std::vector<uint8_t> dataC(cTensor->elementsSize(), 1);
    mluC->copyFromHost(dataC.data(), cTensor->bytesSize());
    std::vector<float> dataX(xTensor->elementsSize(), 7.f);
    mluX->copyFromHost(dataX.data(), xTensor->bytesSize());
    std::vector<float> dataY(yTensor->elementsSize(), 3.f);
    mluY->copyFromHost(dataY.data(), yTensor->bytesSize());
    std::vector<float> cpuOut(outTensor->elementsSize());
    // inference
    {
        void const *inputs[]{dataC.data(), dataX.data(), dataY.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{*mluC, *mluX, *mluY};
        void *outputs[]{*mluOut};
        rCnnl(res, *workspace, inputs, outputs);
    }
    // take output data
    std::vector<float> result(outTensor->elementsSize());
    mluOut->copyToHost(result.data(), outTensor->bytesSize());
    // check
    for (auto i : range0_(result.size())) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}

TEST(kernel, WhereCnnl) {
    testWhereCnnl(Shape{2, 5}, Shape{2, 3, 1, 5}, Shape{3, 2, 5}, Shape{2, 3, 2, 5});
    testWhereCnnl(Shape{1}, Shape{4}, Shape{1}, Shape{4});
    testWhereCnnl(Shape{3}, Shape{2, 3}, Shape{2, 3}, Shape{2, 3});
}

#endif
src/04kernel/src/collectors/concat.cc | 6 + src/04kernel/src/collectors/mat_mul.cc | 6 + src/04kernel/src/collectors/slice.cc | 6 + src/04kernel/src/collectors/split.cc | 6 + .../src/kernels/concat/cnnl_kernel.cc | 93 ++++++++ .../src/kernels/concat/cnnl_kernel.hh | 26 +++ .../src/kernels/mat_mul/cnnl_kernel.cc | 153 +++++++++++++ .../src/kernels/mat_mul/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/slice/cnnl_kernel.cc | 88 ++++++++ src/04kernel/src/kernels/slice/cnnl_kernel.hh | 32 +++ src/04kernel/src/kernels/split/cnnl_kernel.cc | 111 ++++++++++ src/04kernel/src/kernels/split/cnnl_kernel.hh | 37 ++++ .../test/kernels/concat/test_cnnl.cpp | 79 +++++++ .../test/kernels/mat_mul/test_cnnl.cpp | 205 ++++++++++++++++++ src/04kernel/test/kernels/slice/test_cnnl.cpp | 59 +++++ src/04kernel/test/kernels/split/test_cnnl.cpp | 80 +++++++ src/07onnx/src/operators/simple_binary.cc | 2 +- 17 files changed, 1020 insertions(+), 1 deletion(-) create mode 100644 src/04kernel/src/kernels/concat/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/concat/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/slice/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/slice/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/split/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/split/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/concat/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/mat_mul/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/slice/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/split/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/concat.cc b/src/04kernel/src/collectors/concat.cc index 8e6386907..27f27c776 100644 --- a/src/04kernel/src/collectors/concat.cc +++ b/src/04kernel/src/collectors/concat.cc @@ -1,6 +1,7 @@ #include 
"kernel/collectors/concat.h" #include "../kernels/concat/cpu_kernel.hh" #include "../kernels/concat/cuda_kernel.hh" +#include "../kernels/concat/cnnl_kernel.hh" namespace refactor::kernel { @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ConcatCnnl::build(axis, inputs, outputs[0].get()); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/mat_mul.cc b/src/04kernel/src/collectors/mat_mul.cc index 7581200cd..d6b323260 100644 --- a/src/04kernel/src/collectors/mat_mul.cc +++ b/src/04kernel/src/collectors/mat_mul.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/mat_mul.h" +#include "../kernels/mat_mul/cnnl_kernel.hh" #include "../kernels/mat_mul/cpu_kernel.hh" #include "../kernels/mat_mul/cublas_kernel.hh" #include "kernel/attributes/mat_mul_info.h" @@ -26,6 +27,11 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(MatMulCublas) break; + case decltype(_target)::Mlu: + if (auto ptr = MatMulCnnl::build(inputs, outputs, transA, transB, alpha, beta); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/slice.cc b/src/04kernel/src/collectors/slice.cc index 0b063dd17..60c93cb28 100644 --- a/src/04kernel/src/collectors/slice.cc +++ b/src/04kernel/src/collectors/slice.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/slice.h" #include "../kernels/slice/cpu_kernel.hh" #include "../kernels/slice/cuda_kernel.hh" +#include "../kernels/slice/cnnl_kernel.hh" namespace refactor::kernel { @@ -26,6 +27,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = SliceCnnl::build(inputs[0].get().dataType, dimentions, inputs[0].get().shape, outputs[0].get().shape); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: 
UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/split.cc b/src/04kernel/src/collectors/split.cc index 6fe22548d..6a3be0a65 100644 --- a/src/04kernel/src/collectors/split.cc +++ b/src/04kernel/src/collectors/split.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/split.h" +#include "../kernels/split/cnnl_kernel.hh" #include "../kernels/split/cpu_kernel.hh" #include "../kernels/split/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = SplitCnnl::build(axis, inputs[0].get(), outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc new file mode 100644 index 000000000..7d11dbaa9 --- /dev/null +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -0,0 +1,93 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = ConcatCnnl; + + K::ConcatCnnl(SplitInfoCnnl info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, TensorRefs inputs, Tensor output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(SplitInfoCnnl(axis, output, inputs)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing split operation using CNNL"; + } + +#ifdef USE_BANG + auto ConcatCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in; + std::vector out; 
+ bool f32; + + explicit Descriptors(int n, decltype(f32) f32_) + : in(nullptr), + out(std::vector(n, nullptr)), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + for (auto i = 0; i < out.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.num, info.dataType != DT::F64); + setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + for (auto i = 0; i < info.outDims.size(); i++) { + setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSplitWorkspaceSize(handle, info.num, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), n = info.num, axis = info.axis, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + const void *argv[n]; + for (auto i = 0; i < n; i++) { + argv[i] = inputs[i]; + } + + CNNL_ASSERT(cnnlConcat( + handle, n, axis, d->out.data(), argv, + workspace, workspaceSize, d->in, outputs[0])); + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.hh b/src/04kernel/src/kernels/concat/cnnl_kernel.hh new file mode 100644 index 000000000..c49d5607a --- /dev/null +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.hh @@ -0,0 +1,26 @@ +#ifndef KERNEL_CONCAT_CNNL_KERNEL_HH +#define KERNEL_CONCAT_CNNL_KERNEL_HH + +#include "../../kernels/split/cnnl_kernel.hh" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + 
struct ConcatCnnl final : public Kernel { + SplitInfoCnnl info; + + explicit ConcatCnnl(SplitInfoCnnl) noexcept; + + static KernelBox build(int, TensorRefs, Tensor) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CONCAT_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc new file mode 100644 index 000000000..7be05b7b2 --- /dev/null +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc @@ -0,0 +1,153 @@ +#include "cnnl_kernel.hh" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = MatMulCnnl; + using DT = DataType; + + K::MatMulCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(TensorRefs inputs_, TensorRefs outputs_, bool transA_, bool transB_, float alpha_, float beta_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + auto dt = inputs_[0].get().dataType; + return dt.isIeee754() || dt == DT::I8 + ? std::make_unique(decltype(info){ + dt, + transA_, + transB_, + alpha_, + beta_, + std::vector(inputs_[0].get().shape.begin(), inputs_[0].get().shape.end()), + std::vector(inputs_[1].get().shape.begin(), inputs_[1].get().shape.end()), + std::vector(outputs_[0].get().shape.begin(), outputs_[0].get().shape.end()), + inputs_.size() == 3 + ? inputs_[2].get().shape.size() == 0 ? 
std::make_optional(std::vector(1, 1)) + : std::make_optional(std::vector( + inputs_[2].get().shape.begin(), + inputs_[2].get().shape.end())) + : std::nullopt, + }) + : nullptr; + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing MatMul using CNNL"; + } + + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t a, b, c; + cnnlMatMulDescriptor_t bmm; + cnnlMatMulAlgo_t algo; + cnnlMatMulHeuristicResult_t heuristic; + cnnlTensorDescriptor_t bias; + bool addBias, f32; + + explicit Descriptors(bool addBias_, bool f32_) + : a(nullptr), b(nullptr), c(nullptr), + bmm(nullptr), algo(nullptr), heuristic(nullptr), + bias(nullptr), addBias(addBias_), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&a)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&b)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&c)); + if (addBias) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&bias)); + } + CNNL_ASSERT(cnnlMatMulDescCreate(&bmm)); + CNNL_ASSERT(cnnlMatMulAlgoCreate(&algo)); + CNNL_ASSERT(cnnlCreateMatMulHeuristicResult(&heuristic)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(a)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(b)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(c)); + if (addBias) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(bias)); + } + CNNL_ASSERT(cnnlMatMulDescDestroy(bmm)); + CNNL_ASSERT(cnnlMatMulAlgoDestroy(algo)); + CNNL_ASSERT(cnnlDestroyMatMulHeuristicResult(heuristic)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.biasDim.has_value(), info.dataType != DT::F64); + 
setCnnlTensor(d->a, info.dataType, slice(info.aDim.data(), info.aDim.size())); + setCnnlTensor(d->b, info.dataType, slice(info.bDim.data(), info.bDim.size())); + setCnnlTensor(d->c, info.dataType, slice(info.cDim.data(), info.cDim.size())); + if (d->addBias) { + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->bias, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), + info.biasDim.value().size(), info.biasDim.value().data())); + } + int32_t tA = info.transA, tB = info.transB; + CNNL_ASSERT(cnnlSetMatMulDescAttr(d->bmm, CNNL_MATMUL_DESC_TRANSA, + &tA, sizeof(int32_t))); + CNNL_ASSERT(cnnlSetMatMulDescAttr(d->bmm, CNNL_MATMUL_DESC_TRANSB, + &tB, sizeof(int32_t))); + auto handle = res.fetchOrStore()->handle; + int returnedAlgoCount = 0; + CNNL_ASSERT(cnnlGetBatchMatMulAlgoHeuristic( + handle, d->bmm, d->a, d->b, d->c, + NULL, 1, &(d->heuristic), &returnedAlgoCount)); + + size_t algoWorkspaceSize; + CNNL_ASSERT(cnnlGetBatchMatMulHeuristicResult(d->heuristic, d->algo, &algoWorkspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), algoWorkspaceSize, + aa = info.alpha, bb = info.beta](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + // build alpha/beta for double + auto alpha = d->f32 ? factor(aa) : factor(aa), + beta = d->f32 ? factor(bb) : factor(bb), + // one = d->f32 ? factor(1) : factor(1), + zero = d->f32 ? factor(0) : factor(0); + + if (d->addBias) { + CNNL_ASSERT(cnnlExpand(handle, d->bias, inputs[2], d->c, outputs[0])); + } + + if (alpha != 0) { + CNNL_ASSERT(cnnlBatchMatMulBCast_v2( + handle, d->bmm, d->algo, &alpha, + d->a, inputs[0], d->b, inputs[1], + d->addBias ? 
&beta : &zero, d->c, outputs[0], + workspace, algoWorkspaceSize)); + } + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), algoWorkspaceSize}; + } + + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh new file mode 100644 index 000000000..9b44b192c --- /dev/null +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_MATMUL_CNNL_KERNEL_HH +#define KERNEL_MATMUL_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct MatMulCnnl final : public Kernel { + struct { + DataType dataType; + bool transA, transB; + float alpha, beta; + std::vector aDim, bDim, cDim; + std::optional> biasDim; + } info; + + explicit MatMulCnnl(decltype(info)) noexcept; + + static KernelBox build(TensorRefs, TensorRefs, bool, bool, float, float) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_MATMUL_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.cc b/src/04kernel/src/kernels/slice/cnnl_kernel.cc new file mode 100644 index 000000000..b5f8b4c85 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.cc @@ -0,0 +1,88 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = SliceCnnl; + + K::SliceCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(DataType dt_, Dimensions dims_, Shape in_, Shape out_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return 
std::make_unique(decltype(info){ + dt_, + dims_, + std::vector(in_.begin(), in_.end()), + std::vector(out_.begin(), out_.end()), + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing slice operation using CNNL"; + } + +#ifdef USE_BANG + auto SliceCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in, out; + bool f32; + + explicit Descriptors(decltype(f32) f32_) + : in(nullptr), out(nullptr), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.dataType != DT::F64); + // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + // setCnnlTensor(d->out, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data())); + std::vector begin, end, stride; + for (auto i = 0; i < info.dims.size(); i++) { + // [begin, end), end is not inclued + begin.push_back(info.dims[i].start); + auto sign = info.dims[i].step > 0 ? 
1 : -1; + end.push_back(info.dims[i].start + info.dims[i].step * (info.dims[i].length - 1) + sign); + stride.push_back(info.dims[i].step); + } + + res.fetchOrStore(); + return [d = std::move(d), begin, end, stride](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + CNNL_ASSERT(cnnlStridedSlice( + handle, d->in, inputs[0], + begin.data(), end.data(), stride.data(), + d->out, outputs[0])); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.hh b/src/04kernel/src/kernels/slice/cnnl_kernel.hh new file mode 100644 index 000000000..ace79da1d --- /dev/null +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_SLICE_CNNL_KERNEL_HH +#define KERNEL_SLICE_CNNL_KERNEL_HH + +#include "kernel/attributes/slice_info.h" +#include "kernel/collectors/slice.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct SliceCnnl final : public Kernel { + struct { + DataType dataType; + Dimensions dims; + std::vector inDim, outDim; + } info; + + explicit SliceCnnl(decltype(info)) noexcept; + + static KernelBox build(DataType, Dimensions, Shape, Shape) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SLICE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc new file mode 100644 index 000000000..ace809ffa --- /dev/null +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -0,0 +1,111 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + 
+namespace refactor::kernel { + using K = SplitCnnl; + using Info = SplitInfoCnnl; + + Info::SplitInfoCnnl(DataType dt_, int axis_, int num_, std::vector in_, std::vector> out_) + : dataType(dt_), axis(axis_), num(num_), inDim(std::move(in_)), outDims(std::move(out_)) {} + + + Info::SplitInfoCnnl(int axis, Tensor input, TensorRefs outputs) + : SplitInfoCnnl(input.dataType, axis, outputs.size(), + std::move(std::vector(input.shape.begin(), input.shape.end())), + std::move([](TensorRefs tensors) -> std::vector> { + std::vector> res; + for (uint32_t i = 0; i < tensors.size(); i++) { + res.push_back(std::vector(tensors[i].get().shape.begin(), + tensors[i].get().shape.end())); + } + return res; + }(outputs))) {} + + K::SplitCnnl(SplitInfoCnnl info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, Tensor input, TensorRefs outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(SplitInfoCnnl(axis, input, outputs)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing concat operation using CNNL"; + } + +#ifdef USE_BANG + auto SplitCnnl::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + using DT = DataType; + + struct Descriptors { + cnnlTensorDescriptor_t in; + std::vector out; + bool f32; + + explicit Descriptors(int n, decltype(f32) f32_) + : in(nullptr), + out(std::vector(n, nullptr)), + f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in)); + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); + for (auto i = 0; i < out.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + + 
Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(info.num, info.dataType != DT::F64); + setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + for (auto i = 0; i < info.outDims.size(); i++) { + setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + CNNL_ASSERT(cnnlGetSplitWorkspaceSize(handle, info.num, &workspaceSize)); + + res.fetchOrStore(); + auto routine = [d = std::move(d), n = info.num, axis = info.axis, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + void *argv[n]; + for (auto i = 0; i < n; i++) { + argv[i] = outputs[i]; + } + + CNNL_ASSERT(cnnlSplit( + handle, n, axis, d->in, inputs[0], + workspace, workspaceSize, d->out.data(), argv)); + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.hh b/src/04kernel/src/kernels/split/cnnl_kernel.hh new file mode 100644 index 000000000..bcde3d708 --- /dev/null +++ b/src/04kernel/src/kernels/split/cnnl_kernel.hh @@ -0,0 +1,37 @@ +#ifndef KERNEL_SPLIT_CNNL_KERNEL_HH +#define KERNEL_SPLIT_CNNL_KERNEL_HH + +#include "kernel/collectors/split.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + struct SplitInfoCnnl { + DataType dataType; + int axis; + int num; + std::vector inDim; + std::vector> outDims; + + SplitInfoCnnl(DataType, int, int, std::vector, std::vector>); + SplitInfoCnnl(int, Tensor, TensorRefs); + }; + + struct SplitCnnl final : public Kernel { + SplitInfoCnnl info; + + explicit SplitCnnl(SplitInfoCnnl) noexcept; + + static KernelBox build(int, Tensor, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t 
kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_CNNL_KERNEL_HH diff --git a/src/04kernel/test/kernels/concat/test_cnnl.cpp b/src/04kernel/test/kernels/concat/test_cnnl.cpp new file mode 100644 index 000000000..2911c9e90 --- /dev/null +++ b/src/04kernel/test/kernels/concat/test_cnnl.cpp @@ -0,0 +1,79 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/concat/cpu_kernel.hh" +#include "../../../src/kernels/concat/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ConcatCnnl) { + // build routine + std::vector> inputTensors{ + Tensor::share(DataType::F32, Shape{2, 3, 1, 1, 7, 7}),// 勿 + Tensor::share(DataType::F32, Shape{2, 3, 1, 9, 7, 7}),// 忘 + Tensor::share(DataType::F32, Shape{2, 3, 1, 3, 7, 7}),// 国 + Tensor::share(DataType::F32, Shape{2, 3, 1, 7, 7, 7}),// 耻 + }; + auto result = Tensor::share(DataType::F32, Shape{2, 3, 1, 20, 7, 7}); + TensorRefs inputs_; + inputs_.reserve(inputTensors.size()); + std::transform(inputTensors.begin(), inputTensors.end(), + std::back_inserter(inputs_), + [](auto const &it) { return std::cref(*it); }); + SplitInfo info(3, inputs_); + auto kCpu = ConcatCpu::build(info); + auto kernel = ConcatCnnl::build(3, inputs_, *result); + ASSERT_TRUE(kCpu && kernel); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIns[]{ + dev.malloc(inputTensors[0]->bytesSize()), + dev.malloc(inputTensors[1]->bytesSize()), + dev.malloc(inputTensors[2]->bytesSize()), + dev.malloc(inputTensors[3]->bytesSize()), + }, + mluOut = dev.malloc(result->bytesSize()); 
+ // put input data + std::vector + cpuIns[]{ + std::vector(inputTensors[0]->elementsSize()), + std::vector(inputTensors[1]->elementsSize()), + std::vector(inputTensors[2]->elementsSize()), + std::vector(inputTensors[3]->elementsSize()), + }, + cpuOut(result->elementsSize()), + out(result->elementsSize()); + std::iota(cpuIns[0].begin(), cpuIns[0].end(), 0); + std::iota(cpuIns[1].begin(), cpuIns[1].end(), 0); + std::iota(cpuIns[2].begin(), cpuIns[2].end(), 0); + std::iota(cpuIns[3].begin(), cpuIns[3].end(), 0); + mluIns[0]->copyFromHost(cpuIns[0].data(), inputTensors[0]->bytesSize()); + mluIns[1]->copyFromHost(cpuIns[1].data(), inputTensors[1]->bytesSize()); + mluIns[2]->copyFromHost(cpuIns[2].data(), inputTensors[2]->bytesSize()); + mluIns[3]->copyFromHost(cpuIns[3].data(), inputTensors[3]->bytesSize()); + // inference + { + void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + mluOut->copyToHost(out.data(), result->bytesSize()); + EXPECT_EQ(out, cpuOut); +} + +#endif diff --git a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp new file mode 100644 index 000000000..251c5c898 --- /dev/null +++ b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp @@ -0,0 +1,205 @@ +#ifdef USE_BANG + +#include "../src/kernels/mat_mul/cnnl_kernel.hh" +#include "../src/kernels/mat_mul/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TensorRefs getRefs(std::vector> tensors) { + TensorRefs refs; + std::transform(tensors.begin(), tensors.end(), std::back_inserter(refs), + [](auto const &it) { return std::cref(*it); }); + return refs; +} + +TEST(kernel, 
MatMulCnnl_OnlyBias) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{2, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{2, 2}); + auto C = Tensor::share(DataType::F32, Shape{}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2}); + bool tA = false, tB = false; + float alpha = 0.0, beta = 1.0; + MatMulInfo info(*A, *B, *C, tA, tB, alpha, beta); + auto kernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), tA, tB, 0, 0); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + // put input data + std::vector dataA{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector dataB{0.0, 0.0, 0.0, 0.0}; + std::vector dataC{2.5}; + std::vector ans{2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5}; + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + routine(res, *workspace, inputs, outputs); + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(result[i], ans[i]); + } +} + +TEST(kernel, MatMulCnnl_Broadcast) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{2, 1, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{1, 2, 2, 2}); + auto C = Tensor::share(DataType::F32, Shape{2, 1}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2, 2}); + MatMulInfo info(*A, *B, *C, false, false, 1, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), false, false, 1.0, 1.0); + 
ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}; + std::vector dataB{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}; + std::vector dataC{1.0, 0.0}; + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +TEST(kernel, MatMulCnnl_TransABNoBias) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{1, 3, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{2, 1, 2, 2}); + auto Y = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + MatMulInfo info(*A, *B, {}, true, true, 2.0, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B}), getRefs({Y}), true, true, 2.0, 1.0); + ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA{1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0, + 1.0, 2.0, 3.0, 4.0}; + std::vector dataB{1.0, 2.0, 0.0, 0.5, + 1.0, 
0.0, 0.0, 1.0}; + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +TEST(kernel, MatMulCnnl_Large) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{1, 512}); + auto B = Tensor::share(DataType::F32, Shape{1000, 512}); + auto C = Tensor::share(DataType::F32, Shape{1000}); + auto Y = Tensor::share(DataType::F32, Shape{1, 1000}); + MatMulInfo info(*A, *B, *C, false, true, 1, 1); + auto cpuKernel = MatMulCPU::build(info); + auto mluKernel = MatMulCnnl::build(getRefs({A, B, C}), getRefs({Y}), false, true, 1.0, 1.0); + ASSERT_TRUE(cpuKernel && mluKernel); + auto res = runtime::Resources(); + auto cpuRoutine = cpuKernel->lower(res).routine; + auto [mluRoutine, workspaceSize] = mluKernel->lower(res); + // put input data + std::vector dataA(A->elementsSize()); + for (auto i : range0_(dataA.size())) { + dataA[i] = 1.0 * (i % 4) - 2.0; + } + std::vector dataB(B->elementsSize()); + for (auto i : range0_(dataB.size())) { + dataB[i] = 1.0 * (i % 4) - 2.0; + } + std::vector dataC(C->elementsSize()); + for (auto i : range0_(dataC.size())) { + dataC[i] = 1.0 * (i % 4) - 2.0; + } + std::vector cpuOut(Y->elementsSize()); + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto workspace = dev.malloc(workspaceSize), + ma = dev.malloc(A->bytesSize()), + mb 
= dev.malloc(B->bytesSize()), + mc = dev.malloc(C->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + mc->copyFromHost(dataC.data(), C->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb, *mc}; + void *outputs[]{*my}; + mluRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; + void *outputs[]{cpuOut.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(Y->elementsSize()); + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, cpuOut); +} + +#endif diff --git a/src/04kernel/test/kernels/slice/test_cnnl.cpp b/src/04kernel/test/kernels/slice/test_cnnl.cpp new file mode 100644 index 000000000..914a8fee4 --- /dev/null +++ b/src/04kernel/test/kernels/slice/test_cnnl.cpp @@ -0,0 +1,59 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../../../src/kernels/slice/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SliceCnnl) { + // build routine + Dimensions dims{ + {5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360} + {2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90} + {1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30} + {0, 1, 1}, // 1 -> {0} + {0, 1, 2}, // 2 -> {0, 1} + {0, 1, 3}, // 3 -> {0, 1, 2} + }; + auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}), + output = Tensor::share(DataType::F32, Shape{3, 2, 3, 1, 2, 3}); + SliceInfo info(dims, *input); + auto kernel = SliceCnnl::build(DataType::F32, dims, input->shape, output->shape); + auto kCpu = SliceCpu::build(info); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + auto rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = 
*device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(input->bytesSize()), + mluOut = dev.malloc(output->bytesSize()); + // put input data + std::vector + data(input->elementsSize()), + ans(output->elementsSize()), + result(ans.size()); + std::iota(data.begin(), data.end(), 0); + mluIn->copyFromHost(data.data(), input->bytesSize()); + // inference + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{ans.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + mluOut->copyToHost(result.data(), output->bytesSize()); + EXPECT_EQ(result, ans); +} + +#endif diff --git a/src/04kernel/test/kernels/split/test_cnnl.cpp b/src/04kernel/test/kernels/split/test_cnnl.cpp new file mode 100644 index 000000000..5afa434e6 --- /dev/null +++ b/src/04kernel/test/kernels/split/test_cnnl.cpp @@ -0,0 +1,80 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/split/cpu_kernel.hh" +#include "../../../src/kernels/split/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SplitCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 20, 7, 7}); + std::vector> outputTensors{ + Tensor::share(DataType::F32, Shape{2, 3, 1, 1, 7, 7}),// 勿 + Tensor::share(DataType::F32, Shape{2, 3, 1, 9, 7, 7}),// 忘 + Tensor::share(DataType::F32, Shape{2, 3, 1, 3, 7, 7}),// 国 + Tensor::share(DataType::F32, Shape{2, 3, 1, 7, 7, 7}),// 耻 + }; + TensorRefs outputs_; + outputs_.reserve(outputTensors.size()); + std::transform(outputTensors.begin(), outputTensors.end(), + std::back_inserter(outputs_), + [](auto const &it) { return std::cref(*it); }); + auto info = SplitInfo(3, outputs_); + auto kCpu = SplitCpu::build(info); + auto kernel = SplitCnnl::build(3, *dataTensor, outputs_); + ASSERT_TRUE(kCpu && kernel); + auto res = 
runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto [routine, workspaceSize]= kernel->lower(res); + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIn = dev.malloc(dataTensor->bytesSize()), + mluOuts[]{ + dev.malloc(outputTensors[0]->bytesSize()), + dev.malloc(outputTensors[1]->bytesSize()), + dev.malloc(outputTensors[2]->bytesSize()), + dev.malloc(outputTensors[3]->bytesSize()), + }; + // put input data + std::vector + data(dataTensor->elementsSize()), + outsCpu[]{ + std::vector(outputTensors[0]->elementsSize()), + std::vector(outputTensors[1]->elementsSize()), + std::vector(outputTensors[2]->elementsSize()), + std::vector(outputTensors[3]->elementsSize()), + }, + outs[]{ + std::vector(outputTensors[0]->elementsSize()), + std::vector(outputTensors[1]->elementsSize()), + std::vector(outputTensors[2]->elementsSize()), + std::vector(outputTensors[3]->elementsSize()), + }; + std::iota(data.begin(), data.end(), 0); + mluIn->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOuts[0], *mluOuts[1], *mluOuts[2], *mluOuts[3]}; + routine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{outsCpu[0].data(), outsCpu[1].data(), outsCpu[2].data(), outsCpu[3].data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + for (auto i : range0_(outputTensors.size())) { + mluOuts[i]->copyToHost(outs[i].data(), outputTensors[i]->bytesSize()); + EXPECT_EQ(outs[i], outsCpu[i]); + } +} + +#endif diff --git a/src/07onnx/src/operators/simple_binary.cc b/src/07onnx/src/operators/simple_binary.cc index 2db99bdd3..8ff7660b4 100644 --- a/src/07onnx/src/operators/simple_binary.cc +++ b/src/07onnx/src/operators/simple_binary.cc @@ -10,7 +10,7 @@ namespace refactor::onnx { : Operator(), type(type_) {} auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> 
OpBox { - auto fmod = attributes.getOrInsert( "fmod", {0}).int_(); + auto fmod = attributes.getOrInsert("fmod", {0}).int_(); // clang-format off auto type = opType == "onnx::Add" ? Ty::Add : From beda02971cd50cca84626b657db803982e9b40f6 Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Tue, 16 Jan 2024 03:19:08 +0000 Subject: [PATCH 4/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=92?= =?UTF-8?q?=E6=AD=A6=E7=BA=AA=E5=B9=B3=E5=8F=B0erf/mod/cast/clip/gather/sc?= =?UTF-8?q?atternd=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/cast.cc | 6 + src/04kernel/src/collectors/clip.cc | 6 + src/04kernel/src/collectors/gather.cc | 6 + src/04kernel/src/collectors/scatter_nd.cc | 6 + src/04kernel/src/kernels/cast/cnnl_kernel.cc | 226 ++++++++++++++++++ src/04kernel/src/kernels/cast/cnnl_kernel.hh | 27 +++ src/04kernel/src/kernels/clip/cnnl_kernel.cc | 66 +++++ src/04kernel/src/kernels/clip/cnnl_kernel.hh | 28 +++ .../src/kernels/gather/cnnl_kernel.cc | 86 +++++++ .../src/kernels/gather/cnnl_kernel.hh | 30 +++ .../src/kernels/scatter_nd/cnnl_kernel.cc | 86 +++++++ .../src/kernels/scatter_nd/cnnl_kernel.hh | 30 +++ .../src/kernels/simple_binary/binary_cnnl.cc | 27 ++- .../simple_unary/cnnl_simple_unary_kernel.cc | 4 +- src/04kernel/test/kernels/cast/test_cnnl.cpp | 49 ++++ src/04kernel/test/kernels/clip/test_cnnl.cpp | 51 ++++ .../test/kernels/gather/test_gather_cnnl.cpp | 99 ++++++++ .../test/kernels/scatter_nd/test_cnnl.cpp | 63 +++++ .../simple_binary/test_binary_cnnl.cpp | 89 +++++-- .../test/kernels/simple_unary/test_cnnl.cpp | 1 + 20 files changed, 951 insertions(+), 35 deletions(-) create mode 100644 src/04kernel/src/kernels/cast/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/cast/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/clip/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/clip/cnnl_kernel.hh create mode 100644 
src/04kernel/src/kernels/gather/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/gather/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/cast/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/clip/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/gather/test_gather_cnnl.cpp create mode 100644 src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/cast.cc b/src/04kernel/src/collectors/cast.cc index bca9d2629..45bf3372b 100644 --- a/src/04kernel/src/collectors/cast.cc +++ b/src/04kernel/src/collectors/cast.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/cast.h" #include "../kernels/cast/cpu_kernel.hh" #include "../kernels/cast/cuda_kernel.hh" +#include "../kernels/cast/cnnl_kernel.hh" namespace refactor::kernel { @@ -24,6 +25,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = CastCnnl::build(from, to); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/clip.cc b/src/04kernel/src/collectors/clip.cc index 06ccd020b..364fbe588 100644 --- a/src/04kernel/src/collectors/clip.cc +++ b/src/04kernel/src/collectors/clip.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/clip.h" #include "../kernels/clip/cpu_kernel.hh" #include "../kernels/clip/cuda_kernel.hh" +#include "../kernels/clip/cnnl_kernel.hh" namespace refactor::kernel { @@ -24,6 +25,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ClipCnnl::build(data, hasMax); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/gather.cc b/src/04kernel/src/collectors/gather.cc index 
9c30f1c62..3b66b91f0 100644 --- a/src/04kernel/src/collectors/gather.cc +++ b/src/04kernel/src/collectors/gather.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/gather.h" +#include "../kernels/gather/cnnl_kernel.hh" #include "../kernels/gather/cpu_kernel.hh" #include "../kernels/gather/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = GatherCnnl::build(axis, inputs[0].get(), inputs[1].get(), outputs[0].get()); ptr != nullptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/scatter_nd.cc b/src/04kernel/src/collectors/scatter_nd.cc index 62d63c802..3b43a25fb 100644 --- a/src/04kernel/src/collectors/scatter_nd.cc +++ b/src/04kernel/src/collectors/scatter_nd.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/scatter_nd.h" #include "../kernels/scatter_nd/cpu_kernel.hh" #include "../kernels/scatter_nd/cuda_kernel.hh" +#include "../kernels/scatter_nd/cnnl_kernel.hh" namespace refactor::kernel { @@ -23,6 +24,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = ScatterNDCnnl::build(inputs, outputs); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc new file mode 100644 index 000000000..951209741 --- /dev/null +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -0,0 +1,226 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + + +namespace refactor::kernel { + using K = CastCnnl; + using DT = DataType; + + K::CastCnnl(decltype(from) from_, + decltype(to) to_, + decltype(shape) shape_) noexcept + : from(from_), to(to_), shape(shape_) {} + + auto 
K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(from.dataType, to.dataType, + std::vector(from.shape.begin(), from.shape.end())); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing cast operation using CNNL"; + } + +#ifdef USE_BANG + + static cnnlCastDataType_t castType(DataType from, DataType to); + + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + cnnlCastDataType_t cast; + + Descriptors() : inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + d->cast = castType(from, to); + setCnnlTensor(d->inDesc, from, slice(shape.data(), shape.size())); + setCnnlTensor(d->outDesc, to, slice(shape.data(), shape.size())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } + + static cnnlCastDataType_t castType(DataType from, DataType to) { + switch (from) { + case DT::F32: + switch (to) { + case DT::F64: + return CNNL_CAST_FLOAT_TO_DOUBLE; + case DT::FP16: + return CNNL_CAST_FLOAT_TO_HALF; + case DT::I64: + return CNNL_CAST_FLOAT_TO_INT64; + case DT::I32: + return CNNL_CAST_FLOAT_TO_INT32; + case DT::I16: + 
return CNNL_CAST_FLOAT_TO_INT16; + case DT::I8: + return CNNL_CAST_FLOAT_TO_INT8; + case DT::U8: + return CNNL_CAST_FLOAT_TO_UINT8; + // case DT::BF16: + // return CNNL_CAST_FLOAT_TO_BFLOAT16; + case DT::Bool: + return CNNL_CAST_FLOAT_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::FP16: + switch (to) { + case DT::F32: + return CNNL_CAST_HALF_TO_FLOAT; + case DT::I64: + return CNNL_CAST_HALF_TO_INT64; + case DT::I32: + return CNNL_CAST_HALF_TO_INT32; + case DT::I16: + return CNNL_CAST_HALF_TO_INT16; + case DT::I8: + return CNNL_CAST_HALF_TO_INT8; + case DT::U8: + return CNNL_CAST_HALF_TO_UINT8; + case DT::Bool: + return CNNL_CAST_HALF_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::I32: + switch (to) { + case DT::F32: + return CNNL_CAST_INT32_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT32_TO_HALF; + case DT::I64: + return CNNL_CAST_INT32_TO_INT64; + case DT::I16: + return CNNL_CAST_INT32_TO_INT16; + case DT::I8: + return CNNL_CAST_INT32_TO_INT8; + case DT::Bool: + return CNNL_CAST_INT32_TO_BOOL; + default: + UNREACHABLE(); + } + case DT::I16: + switch (to) { + case DT::F32: + return CNNL_CAST_INT16_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT16_TO_HALF; + case DT::I32: + return CNNL_CAST_INT16_TO_INT32; + // case DT::I8: + // return CNNL_CAST_INT16_TO_INT8; + default: + UNREACHABLE(); + } + case DT::I8: + switch (to) { + case DT::F32: + return CNNL_CAST_INT8_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT8_TO_HALF; + case DT::I32: + return CNNL_CAST_INT8_TO_INT32; + case DT::I16: + return CNNL_CAST_INT8_TO_INT16; + default: + UNREACHABLE(); + } + case DT::U8: + switch (to) { + case DT::F32: + return CNNL_CAST_UINT8_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_UINT8_TO_HALF; + case DT::I64: + return CNNL_CAST_UINT8_TO_INT64; + case DT::I32: + return CNNL_CAST_UINT8_TO_INT32; + default: + UNREACHABLE(); + } + case DT::Bool: + switch (to) { + case DT::F32: + return CNNL_CAST_BOOL_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_BOOL_TO_HALF; + case 
DT::I32: + return CNNL_CAST_BOOL_TO_INT32; + default: + UNREACHABLE(); + } + case DT::I64: + switch (to) { + case DT::F32: + return CNNL_CAST_INT64_TO_FLOAT; + case DT::FP16: + return CNNL_CAST_INT64_TO_HALF; + case DT::I32: + return CNNL_CAST_INT64_TO_INT32; + case DT::U32: + return CNNL_CAST_INT64_TO_UINT32; + default: + UNREACHABLE(); + } + case DT::U32: + switch (to) { + case DT::I64: + return CNNL_CAST_UINT32_TO_INT64; + case DT::U64: + return CNNL_CAST_UINT32_TO_UINT64; + default: + UNREACHABLE(); + } + case DT::F64: + switch (to) { + case DT::F32: + return CNNL_CAST_DOUBLE_TO_FLOAT; + default: + UNREACHABLE(); + } + case DT::BF16: + switch (to) { + // case DT::F32: + // return CNNL_CAST_BF16_TO_FLOAT; + default: + UNREACHABLE(); + } + default: + UNREACHABLE(); + } + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.hh b/src/04kernel/src/kernels/cast/cnnl_kernel.hh new file mode 100644 index 000000000..b1e638080 --- /dev/null +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_CAST_CNNL_KERNEL_HH +#define KERNEL_CAST_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct CastCnnl final : public Kernel { + DataType from, to; + std::vector shape; + + CastCnnl(decltype(from), decltype(to), decltype(shape)) noexcept; + + static KernelBox build(Tensor const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CAST_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.cc b/src/04kernel/src/kernels/clip/cnnl_kernel.cc new file mode 100644 index 000000000..423939bc9 --- /dev/null +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.cc @@ -0,0 +1,66 @@ +#include 
"cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ClipCnnl; + + K::ClipCnnl(decltype(dataType) dt, + decltype(shape) shape_, + decltype(hasMax) hasMax_) noexcept + : dataType(dt), shape(shape_), hasMax(hasMax_) { + } + + auto K::build(Tensor const &data, bool hasMax) noexcept -> KernelBox { + return data.dataType.isCpuNumberic() + ? std::make_unique(data.dataType, + std::vector(data.shape.begin(), data.shape.end()), + hasMax) + : nullptr; + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing clip operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t t; + + Descriptors() : t(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&t)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(t)); + } + }; + auto d = std::make_shared(); + setCnnlTensor(d->t, dataType, slice(shape.data(), shape.size())); + + res.fetchOrStore(); + return [d = std::move(d), hasMax = this->hasMax](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlClip_v2(res.fetchOrStore()->handle, + CNNL_POINTER_MODE_DEVICE, d->t, + inputs[0], inputs[1], hasMax ? 
inputs[2] : nullptr, + d->t, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.hh b/src/04kernel/src/kernels/clip/cnnl_kernel.hh new file mode 100644 index 000000000..37d168062 --- /dev/null +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_CLIP_CNNL_KERNEL_HH +#define KERNEL_CLIP_CNNL_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct ClipCnnl final : public Kernel { + DataType dataType; + std::vector shape; + bool hasMax; + + ClipCnnl(decltype(dataType), decltype(shape), decltype(hasMax)) noexcept; + + static KernelBox build(Tensor const &, bool hasMax) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_CLIP_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc new file mode 100644 index 000000000..19f9ed358 --- /dev/null +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = GatherCnnl; + + K::GatherCnnl(decltype(info) info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(int axis, Tensor input, Tensor index, Tensor output) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info){ + input.dataType, + index.dataType, + axis, + std::vector(input.shape.begin(), input.shape.end()), + std::vector(index.shape.begin(), index.shape.end()), + 
std::vector(output.shape.begin(), output.shape.end()), + }); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing gather using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, indexDesc, outDesc; + + Descriptors() : inDesc(nullptr), indexDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&indexDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(indexDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + info.indexDim.size(), info.indexDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outDim.size(), info.outDim.data())); + + size_t workspaceSize = info.inDim.size() * sizeof(int); + + res.fetchOrStore(); + auto routine = [d = std::move(d), + shape = info.inDim.data(), workspaceSize, + dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore()->handle, dim, + d->inDesc, inputs[0], reinterpret_cast(workspace), + d->indexDesc, 
reinterpret_cast(inputs[1]), + d->outDesc, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + + return {std::move(routine), workspaceSize}; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.hh b/src/04kernel/src/kernels/gather/cnnl_kernel.hh new file mode 100644 index 000000000..f60a8ada0 --- /dev/null +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.hh @@ -0,0 +1,30 @@ +#ifndef KERNEL_GATHER_CNNL_KERNEL_HH +#define KERNEL_GATHER_CNNL_KERNEL_HH + +#include "kernel/attributes/gather_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct GatherCnnl final : public Kernel { + struct { + DataType dataType, indexDataType; + int axis; + std::vector inDim, indexDim, outDim; + } info; + + explicit GatherCnnl(decltype(info)) noexcept; + + static KernelBox build(int, Tensor, Tensor, Tensor) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_TRANSPOSE_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc new file mode 100644 index 000000000..8f181b58b --- /dev/null +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc @@ -0,0 +1,86 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = ScatterNDCnnl; + + K::ScatterNDCnnl(decltype(info) info_) + : Kernel(), info(std::move(info_)) {} + + auto K::build(TensorRefs inputs, TensorRefs outputs) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + return std::make_unique(decltype(info){ + inputs[0].get().dataType, + inputs[1].get().dataType, + 
inputs[2].get().dataType, + std::vector(inputs[0].get().shape.begin(), inputs[0].get().shape.end()), + std::vector(inputs[1].get().shape.begin(), inputs[1].get().shape.end()), + std::vector(inputs[2].get().shape.begin(), inputs[2].get().shape.end()), + std::vector(outputs[0].get().shape.begin(), outputs[0].get().shape.end()), + }); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing scatterNd operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, indexDesc, updateDesc, outDesc; + + Descriptors() : inDesc(nullptr), indexDesc(nullptr), + updateDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&indexDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&updateDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(indexDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(updateDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + info.indexDim.size(), info.indexDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->updateDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.updateDataType), + info.updateDim.size(), info.updateDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + 
d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.updateDataType), + info.outDim.size(), info.outDim.data())); + + res.fetchOrStore(); + return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + CNNL_ASSERT(cnnlScatterNd_v2(res.fetchOrStore()->handle, CNNL_SCATTERND_UPDATE, + d->indexDesc, inputs[1], d->updateDesc, inputs[2], + d->inDesc, inputs[0], d->outDesc, outputs[0])); + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh new file mode 100644 index 000000000..81fca0357 --- /dev/null +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.hh @@ -0,0 +1,30 @@ +#ifndef KERNEL_SCATTER_ND_CNNL_KERNEL_HH +#define KERNEL_SCATTER_ND_CNNL_KERNEL_HH + +#include "kernel/attributes/scatter_nd_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct ScatterNDCnnl final : public Kernel { + struct { + DataType dataType, indexDataType, updateDataType; + std::vector inDim, indexDim, updateDim, outDim; + } info; + + explicit ScatterNDCnnl(decltype(info)); + + static KernelBox build(TensorRefs, TensorRefs) noexcept; + + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SCATTER_ND_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index cfe2ce0fa..86e605349 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -16,21 +16,19 @@ namespace refactor::kernel { auto K::build(Op op, Tensor const &a, Tensor const &b, Tensor const &c) noexcept -> KernelBox 
{ static const std::unordered_set - ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow}; + ARTHIMETIC{Op::Add, Op::Sub, Op::Mul, Op::Div, Op::And, Op::Or, Op::Xor, Op::Pow, Op::Mod, Op::Fmod}; #ifndef USE_BANG return nullptr; #endif if (a.dataType != b.dataType || - !a.dataType.isFloat() || + // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c (a.shape != c.shape && b.shape != c.shape) || // Sub only supports brocasting b - (a.shape != c.shape && op == Op::Sub) || - // Cnnl binary op only supports up to 5D - !((a.rank() == 5 && b.rank() == 5) || (a.rank() <= 4 && b.rank() <= 4))) { + (a.shape != c.shape && op == Op::Sub)) { return nullptr; } @@ -103,16 +101,19 @@ namespace refactor::kernel { setCnnlTensor(d->aDesc, dataType, slice(aDims.data(), aDims.size())); setCnnlTensor(d->bDesc, dataType, slice(bDims.data(), bDims.size())); setCnnlTensor(d->cDesc, dataType, slice(cDims.data(), cDims.size())); - CNNL_ASSERT(cnnlSetOpTensorDescriptor( - d->opDesc, cnnlOP, - cnnlDataTypeConvert(d->f32 ? DT::F32 : DT::F64), - CNNL_NOT_PROPAGATE_NAN)); + if (cnnlOP) { + CNNL_ASSERT(cnnlSetOpTensorDescriptor( + d->opDesc, cnnlOP, + cnnlDataTypeConvert(dataType), + CNNL_NOT_PROPAGATE_NAN)); + } auto cnnlGetBinaryWorkspaceSize = (opType == SimpleBinaryType::Add || opType == SimpleBinaryType::Sub || opType == SimpleBinaryType::Mul) ? cnnlGetOpTensorWorkspaceSize : (opType == SimpleBinaryType::Div) ? cnnlGetDivWorkspaceSize : (opType == SimpleBinaryType::And || opType == SimpleBinaryType::Or || opType == SimpleBinaryType::Xor) ? cnnlGetLogicOpWorkspaceSize : (opType == SimpleBinaryType::Pow) ? cnnlGetPowWorkspaceSize + : (opType == SimpleBinaryType::Mod || opType == SimpleBinaryType::Fmod) ? 
cnnlGetFloorModWorkspaceSize : nullptr; if (cnnlGetBinaryWorkspaceSize == nullptr) { @@ -184,7 +185,15 @@ namespace refactor::kernel { d->bDesc, b, workspace, workspaceSize, d->cDesc, c)); + } else if (op == SimpleBinaryType::Mod || op == SimpleBinaryType::Fmod) { + CNNL_ASSERT(cnnlFloorMod(handle, + d->aDesc, a, + d->bDesc, b, + d->cDesc, c, + workspace, workspaceSize)); } + + BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc index f6e32159d..0c91a28d3 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -17,7 +17,7 @@ namespace refactor::kernel { : Kernel(), type(type_), dataType(dataType_), size(size_) {} auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { - static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg}; + static const std::unordered_set supportedOp{Op::Abs, Op::Sqrt, Op::Neg, Op::Erf}; #ifndef USE_BANG return nullptr; @@ -74,6 +74,8 @@ namespace refactor::kernel { return cnnlNegTensor(handle, x_desc, x, y_desc, y); case Ty::Sqrt: return cnnlSqrt_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); + case Ty::Erf: + return cnnlErf_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); default: UNREACHABLE(); } diff --git a/src/04kernel/test/kernels/cast/test_cnnl.cpp b/src/04kernel/test/kernels/cast/test_cnnl.cpp new file mode 100644 index 000000000..72f842470 --- /dev/null +++ b/src/04kernel/test/kernels/cast/test_cnnl.cpp @@ -0,0 +1,49 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/cast/cpu_kernel.hh" +#include "../../../src/kernels/cast/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + 
+TEST(kernel, CastCnnl) { + // build routine + auto x = Tensor::share(DataType::F32, Shape{2, 3, 4, 5}); + auto y = Tensor::share(DataType::I8, Shape{2, 3, 4, 5}); + auto kernel = CastCnnl::build(*x, *y), + kCpu = CastCpu::build(*x, *y); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto xMlu = dev.malloc(x->bytesSize()), + yMlu = dev.malloc(y->bytesSize()); + // put input data + std::vector x_(x->elementsSize()); + std::vector y_(y->elementsSize()); + std::iota(x_.begin(), x_.end(), 0); + xMlu->copyFromHost(x_.data(), x->bytesSize()); + // inference + { + void const *inputs[]{*xMlu}; + void *outputs[]{*yMlu}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{x_.data()}; + void *outputs[]{y_.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(y->elementsSize()); + yMlu->copyToHost(result.data(), y->bytesSize()); + EXPECT_EQ(result, y_); +} + +#endif diff --git a/src/04kernel/test/kernels/clip/test_cnnl.cpp b/src/04kernel/test/kernels/clip/test_cnnl.cpp new file mode 100644 index 000000000..ab29a1cb5 --- /dev/null +++ b/src/04kernel/test/kernels/clip/test_cnnl.cpp @@ -0,0 +1,51 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/clip/cpu_kernel.hh" +#include "../../../src/kernels/clip/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ClipCnnl) { + // build routine + auto data = Tensor::share(DataType::F32, Shape{2, 3, 4, 5}); + auto kernel = ClipCnnl::build(*data, true), + kCpu = ClipCpu::build(*data, true); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto 
mluMem = dev.malloc(data->bytesSize()), + mluMin = dev.malloc(sizeof(float)), + mluMax = dev.malloc(sizeof(float)); + // put input data + std::vector value(data->elementsSize()); + float min = 30, max = 80; + std::iota(value.begin(), value.end(), 0); + mluMem->copyFromHost(value.data(), data->bytesSize()); + mluMin->copyFromHost(&min, sizeof(float)); + mluMax->copyFromHost(&max, sizeof(float)); + // inference + { + void const *inputs[]{*mluMem, *mluMin, *mluMax}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{value.data(), &min, &max}; + void *outputs[]{value.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(data->elementsSize()); + mluMem->copyToHost(result.data(), data->bytesSize()); + EXPECT_EQ(result, value); +} + +#endif diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp new file mode 100644 index 000000000..a3f739cd0 --- /dev/null +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -0,0 +1,99 @@ +#ifdef USE_BANG + +#include "../src/kernels/gather/cnnl_kernel.hh" +#include "../src/kernels/gather/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, GatherCnnl) { + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + // Case axis = 0, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{3, 2}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I32, Shape{2, 2}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{2, 2, 2}, LayoutType::NCHW); + GatherInfo info(0, *data, *indices); + auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = 
cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a{1.0, 1.2, 2.3, 3.4, 4.5, 5.7}; + std::vector b{0, 1, 1, 2}; + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } + + // Case axis = 1, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{3, 3}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I32, Shape{1, 2}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{3, 1, 2}, LayoutType::NCHW); + GatherInfo info(1, *data, *indices); + auto cnnlKernel = GatherCnnl::build(1, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a{1.0, 1.2, 1.9, 2.3, 3.4, 3.9, 4.5, 5.7, 5.9}; + std::vector b{0, 2}; + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + 
bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } +} + +#endif diff --git a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp new file mode 100644 index 000000000..1cdf72166 --- /dev/null +++ b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp @@ -0,0 +1,63 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" +#include "../../../src/kernels/scatter_nd/cnnl_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, ScatterNDCnnl) { + // build routine + auto data = Tensor::share(DataType::F32, Shape{8}); + auto indices = Tensor::share(DataType::I64, Shape{4, 1}); + auto updates = Tensor::share(DataType::F32, Shape{4}); + auto output = Tensor::share(DataType::F32, Shape{8}); + ScatterNDInfo info(*data, *indices); + auto getRefs = [](std::vector> tensors) -> TensorRefs { + TensorRefs refs; + std::transform(tensors.begin(), tensors.end(), std::back_inserter(refs), + [](auto const &it) { return std::cref(*it); }); + return refs; + }; + auto kernel = ScatterNDCnnl::build(getRefs({data, indices, updates}), getRefs({output})), + kCpu = ScatterNDCpu::build(info); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluData = dev.malloc(data->bytesSize()), + mluIndices = 
dev.malloc(indices->bytesSize()), + mluUpdates = dev.malloc(updates->bytesSize()), + mluOut = dev.malloc(output->bytesSize()); + // put input data + std::vector data_(data->elementsSize()); + std::iota(data_.begin(), data_.end(), 1); + std::vector indices_{4, 3, 1, 7}; + std::vector updates_{9, 10, 11, 12}; + mluData->copyFromHost(data_.data(), data->bytesSize()); + mluIndices->copyFromHost(indices_.data(), indices->bytesSize()); + mluUpdates->copyFromHost(updates_.data(), updates->bytesSize()); + // inference + { + void const *inputs[]{*mluData, *mluIndices, *mluUpdates}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data_.data(), indices_.data(), updates_.data()}; + void *outputs[]{data_.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + std::vector result(output->elementsSize()); + mluOut->copyToHost(result.data(), output->bytesSize()); + EXPECT_EQ(result, data_); +} + +#endif diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp index 4ef7c6d23..7bd18bf6d 100644 --- a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp @@ -9,11 +9,13 @@ using namespace refactor; using namespace kernel; using namespace hardware; +template void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape dimC) { // Create Tensor and build kernels - auto aTensor = Tensor::share(DataType::F32, dimA, LayoutType::NCHW); - auto bTensor = Tensor::share(DataType::F32, dimB, LayoutType::NCHW); - auto cTensor = Tensor::share(DataType::F32, dimC, LayoutType::NCHW); + using T_ = primitive::type; + auto aTensor = Tensor::share(T, dimA, LayoutType::NCHW); + auto bTensor = Tensor::share(T, dimB, LayoutType::NCHW); + auto cTensor = Tensor::share(T, dimC, LayoutType::NCHW); auto kernel = BinaryCnnl::build(binaryOPT, *aTensor, *bTensor, *cTensor); auto kCpu = 
BinaryCpu::build(binaryOPT, *aTensor, *bTensor); ASSERT_TRUE(kCpu && kernel); @@ -21,10 +23,9 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di auto [routine, workspaceSize] = kernel->lower(res); auto rCpu = kCpu->lower(res).routine; // Init inputs and outputs - std::vector - a(aTensor->elementsSize(), 3.0f), - b(bTensor->elementsSize(), 2.0f), - c(cTensor->elementsSize()); + std::vector a(aTensor->elementsSize(), 3); + std::vector b(bTensor->elementsSize(), 2); + std::vector c(cTensor->elementsSize()); auto &dev = *device::init(Device::Type::Mlu, 0, ""); auto workspace = dev.malloc(workspaceSize), aMLU = dev.malloc(aTensor->bytesSize()), @@ -44,47 +45,85 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di rCpu(res, nullptr, inputs, outputs); } // Compare - std::vector result(cTensor->elementsSize()); + std::vector result(cTensor->elementsSize()); cMLU->copyToHost(result.data(), cTensor->bytesSize()); for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(c[i], result[i]); + EXPECT_EQ(c[i], result[i]); } } - TEST(kernel, BinaryCnnlAdd) { - testBinaryCnnl(SimpleBinaryType::Add, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Add, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlMul) { - testBinaryCnnl(SimpleBinaryType::Mul, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Mul, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlSub) { - testBinaryCnnl(SimpleBinaryType::Sub, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Sub, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } TEST(kernel, BinaryCnnlDiv) { - testBinaryCnnl(SimpleBinaryType::Div, Shape{10, 20, 30, 40}, 
Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); + testBinaryCnnl(SimpleBinaryType::Div, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); } -// TEST(kernel, BinaryCnnlAnd) { -// testBinaryCnnl(SimpleBinaryType::And, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); -// } +TEST(kernel, BinaryCnnlPow) { + testBinaryCnnl(SimpleBinaryType::Pow, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} + +TEST(kernel, BinaryCnnlMod) { + testBinaryCnnl(SimpleBinaryType::Mod, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} -// TEST(kernel, BinaryCnnlOr) { -// testBinaryCnnl(SimpleBinaryType::Or, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); +TEST(kernel, BinaryCnnlFMod) { + testBinaryCnnl(SimpleBinaryType::Fmod, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}, + Shape{2, 5, 10, 20, 3, 4}); +} + +// TEST(kernel, BinaryCnnlMod) { +// testBinaryCnnl(SimpleBinaryType::Mod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); // } -// TEST(kernel, BinaryCnnlXor) { -// testBinaryCnnl(SimpleBinaryType::Xor, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); +// TEST(kernel, BinaryCnnlFmodI8) { +// testBinaryCnnl(SimpleBinaryType::Fmod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); // } -TEST(kernel, BinaryCnnlPow) { - testBinaryCnnl(SimpleBinaryType::Pow, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}, Shape{10, 20, 30, 40}); -} +// TEST(kernel, BinaryCnnlFmodF32) { +// testBinaryCnnl(SimpleBinaryType::Fmod, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}, +// Shape{2, 5, 10, 20, 3, 4}); +// } TEST(kernel, BinaryCnnlBroadcast) { - testBinaryCnnl(SimpleBinaryType::Add, Shape{3, 4, 5, 6}, Shape{}, Shape{3, 4, 5, 6}); + testBinaryCnnl(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 
3, 4, 5, 6}); } + #endif diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index 2707e6274..e4b9428ee 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -51,6 +51,7 @@ TEST(kernel, SimpleUnaryCnnl) { testOp(SimpleUnaryType::Abs, false); testOp(SimpleUnaryType::Neg, false); testOp(SimpleUnaryType::Sqrt, false); + testOp(SimpleUnaryType::Erf, false); } TEST(kernel, ActivationCnnl) { From 5aa7a1e6d8edd320de298d99ad486d4a440e56ad Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Wed, 17 Jan 2024 08:23:03 +0000 Subject: [PATCH 5/9] =?UTF-8?q?feat:=20mlu=E8=B7=91=E9=80=9Allama=EF=BC=8C?= =?UTF-8?q?=E4=BD=86=E6=9C=AA=E5=BE=97=E5=88=B0=E6=AD=A3=E7=A1=AE=E7=BB=93?= =?UTF-8?q?=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/concat.cc | 4 ++- src/04kernel/src/collectors/gather.cc | 9 +++++-- src/04kernel/src/collectors/split.cc | 4 ++- .../src/kernels/concat/cnnl_kernel.cc | 2 +- .../src/kernels/concat/cnnl_kernel.hh | 2 +- .../src/kernels/gather/cnnl_kernel.cc | 5 ++-- .../src/kernels/gather/cnnl_kernel.hh | 2 +- .../src/kernels/reduce/cnnl_kernel.cc | 6 +++-- .../src/kernels/simple_binary/binary_cnnl.cc | 25 +++++-------------- .../simple_unary/cnnl_simple_unary_kernel.cc | 13 +++++----- src/04kernel/src/kernels/split/cnnl_kernel.cc | 11 +++++--- src/04kernel/src/kernels/split/cnnl_kernel.hh | 4 +-- src/09python_ffi/src/compiler.cc | 1 + src/09python_ffi/src/import.cpp | 1 + 14 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/04kernel/src/collectors/concat.cc b/src/04kernel/src/collectors/concat.cc index 27f27c776..0802de8e8 100644 --- a/src/04kernel/src/collectors/concat.cc +++ b/src/04kernel/src/collectors/concat.cc @@ -9,6 +9,8 @@ namespace refactor::kernel { ConcatCollector::filter(TensorRefs inputs, TensorRefs outputs) const 
{ SplitInfo info(axis, inputs); + auto const &b = outputs[0]; + std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -22,7 +24,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = ConcatCnnl::build(axis, inputs, outputs[0].get()); ptr) { + if (auto ptr = ConcatCnnl::build(axis, inputs, b); ptr) { ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/collectors/gather.cc b/src/04kernel/src/collectors/gather.cc index 3b66b91f0..0a8b8b0e8 100644 --- a/src/04kernel/src/collectors/gather.cc +++ b/src/04kernel/src/collectors/gather.cc @@ -9,7 +9,12 @@ namespace refactor::kernel { GatherCollector::filter(TensorRefs inputs, TensorRefs outputs) const { GatherInfo info(axis, inputs[0], inputs[1]); - std::vector ans; + auto const &a = inputs[0]; + auto const &b = inputs[1]; + auto const &c = outputs[0]; + + std::vector + ans; switch (_target) { case decltype(_target)::Cpu: if (auto ptr = GatherCpu::build(info); ptr != nullptr) { @@ -22,7 +27,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = GatherCnnl::build(axis, inputs[0].get(), inputs[1].get(), outputs[0].get()); ptr != nullptr) { + if (auto ptr = GatherCnnl::build(axis, a, b, c); ptr != nullptr) { ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/collectors/split.cc b/src/04kernel/src/collectors/split.cc index 6a3be0a65..b862d8959 100644 --- a/src/04kernel/src/collectors/split.cc +++ b/src/04kernel/src/collectors/split.cc @@ -9,6 +9,8 @@ namespace refactor::kernel { SplitCollector::filter(TensorRefs inputs, TensorRefs outputs) const { SplitInfo info(axis, outputs); + auto const &a = inputs[0]; + std::vector ans; switch (_target) { case decltype(_target)::Cpu: @@ -22,7 +24,7 @@ namespace refactor::kernel { } break; case decltype(_target)::Mlu: - if (auto ptr = SplitCnnl::build(axis, inputs[0].get(), outputs); ptr) { + if (auto ptr = SplitCnnl::build(axis, a, outputs); ptr) { 
ans.emplace_back(std::move(ptr)); } break; diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc index 7d11dbaa9..6c7a0fbaf 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -12,7 +12,7 @@ namespace refactor::kernel { K::ConcatCnnl(SplitInfoCnnl info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, TensorRefs inputs, Tensor output) noexcept -> KernelBox { + auto K::build(int axis, TensorRefs inputs, Tensor const &output) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.hh b/src/04kernel/src/kernels/concat/cnnl_kernel.hh index c49d5607a..0d4f6f853 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.hh @@ -11,7 +11,7 @@ namespace refactor::kernel { explicit ConcatCnnl(SplitInfoCnnl) noexcept; - static KernelBox build(int, TensorRefs, Tensor) noexcept; + static KernelBox build(int, TensorRefs, Tensor const &) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index 19f9ed358..d5ddcacec 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -11,13 +11,14 @@ namespace refactor::kernel { K::GatherCnnl(decltype(info) info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, Tensor input, Tensor index, Tensor output) noexcept -> KernelBox { + auto K::build(int axis, Tensor const &input, Tensor const &index, Tensor const &output) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif + return std::make_unique(decltype(info){ input.dataType, - index.dataType, + DataType::I32, axis, std::vector(input.shape.begin(), input.shape.end()), std::vector(index.shape.begin(), 
index.shape.end()), diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.hh b/src/04kernel/src/kernels/gather/cnnl_kernel.hh index f60a8ada0..154345929 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.hh @@ -15,7 +15,7 @@ namespace refactor::kernel { explicit GatherCnnl(decltype(info)) noexcept; - static KernelBox build(int, Tensor, Tensor, Tensor) noexcept; + static KernelBox build(int, Tensor const &, Tensor const &, Tensor const &) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc index 752bee690..6b22d793d 100644 --- a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -75,8 +75,10 @@ namespace refactor::kernel { for (auto axis : axes) { dimsO[axis] = 1; } - setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); - setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); + // setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); + // setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); // clang-format off auto reduceOp = reduceType == ReduceType::Mean ? 
CNNL_REDUCE_AVG diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 86e605349..11712b4f8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -26,9 +26,10 @@ namespace refactor::kernel { // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c - (a.shape != c.shape && b.shape != c.shape) || + (a.shape != c.shape && b.shape != c.shape) // Sub only supports brocasting b - (a.shape != c.shape && op == Op::Sub)) { + // (a.shape != c.shape && op == Op::Sub) + ) { return nullptr; } @@ -122,18 +123,13 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; size_t workspaceSize; - if (aDims != cDims) { - CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->bDesc, - d->aDesc, d->cDesc, - &workspaceSize)); - } else { - CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, + CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, d->bDesc, d->cDesc, &workspaceSize)); - } + res.fetchOrStore(); - auto routine = [swap = aDims != cDims, d, + auto routine = [d = std::move(d), workspaceSize, cnnlLogicOP, op = this->opType](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { auto handle = res.fetchOrStore()->handle; @@ -151,20 +147,11 @@ namespace refactor::kernel { beta = d->f32 ? 
factor(0) : factor(0); - - if (swap) { - CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, - &alphaB, d->bDesc, b, - &alphaA, d->aDesc, a, - workspace, workspaceSize, - &beta, d->cDesc, c)); - } else { CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, &alphaA, d->aDesc, a, &alphaB, d->bDesc, b, workspace, workspaceSize, &beta, d->cDesc, c)); - } } else if (op == SimpleBinaryType::Div) { CNNL_ASSERT(cnnlDiv_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc index 0c91a28d3..f8c0d7d01 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_simple_unary_kernel.cc @@ -62,12 +62,12 @@ namespace refactor::kernel { setCnnlTensor(d->tensor, dataType, slice(&size, 1)); - auto cnnlUnaryForward = [this](cnnlHandle_t handle, - const cnnlTensorDescriptor_t x_desc, - const void *x, - const cnnlTensorDescriptor_t y_desc, - void *y) -> cnnlStatus_t { - switch (this->type) { + auto cnnlUnaryForward = [t = this->type](cnnlHandle_t handle, + const cnnlTensorDescriptor_t x_desc, + const void *x, + const cnnlTensorDescriptor_t y_desc, + void *y) -> cnnlStatus_t { + switch (t) { case Ty::Abs: return cnnlAbs(handle, x_desc, x, y_desc, y); case Ty::Neg: @@ -77,6 +77,7 @@ namespace refactor::kernel { case Ty::Erf: return cnnlErf_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, x_desc, x, y_desc, y); default: + // fmt::println("{}", unaryName(t)); UNREACHABLE(); } }; diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc index ace809ffa..c967bff2d 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -14,7 +14,7 @@ namespace refactor::kernel { : dataType(dt_), axis(axis_), num(num_), inDim(std::move(in_)), outDims(std::move(out_)) {} - Info::SplitInfoCnnl(int axis, Tensor input, TensorRefs 
outputs) + Info::SplitInfoCnnl(int axis, Tensor const &input, TensorRefs outputs) : SplitInfoCnnl(input.dataType, axis, outputs.size(), std::move(std::vector(input.shape.begin(), input.shape.end())), std::move([](TensorRefs tensors) -> std::vector> { @@ -29,7 +29,7 @@ namespace refactor::kernel { K::SplitCnnl(SplitInfoCnnl info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(int axis, Tensor input, TensorRefs outputs) noexcept -> KernelBox { + auto K::build(int axis, Tensor const &input, TensorRefs outputs) noexcept -> KernelBox { #ifndef USE_BANG return nullptr; #endif @@ -78,9 +78,12 @@ namespace refactor::kernel { Descriptors(Descriptors &&) = delete; }; auto d = std::make_shared(info.num, info.dataType != DT::F64); - setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + for (auto i = 0; i < info.outDims.size(); i++) { - setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data())); } auto handle = res.fetchOrStore()->handle; diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.hh b/src/04kernel/src/kernels/split/cnnl_kernel.hh index bcde3d708..7fb4147d4 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/split/cnnl_kernel.hh @@ -14,7 +14,7 @@ namespace refactor::kernel { std::vector> outDims; SplitInfoCnnl(DataType, int, int, std::vector, std::vector>); - SplitInfoCnnl(int, Tensor, TensorRefs); + SplitInfoCnnl(int, Tensor const &, TensorRefs); }; struct SplitCnnl final : 
public Kernel { @@ -22,7 +22,7 @@ namespace refactor::kernel { explicit SplitCnnl(SplitInfoCnnl) noexcept; - static KernelBox build(int, Tensor, TensorRefs) noexcept; + static KernelBox build(int, Tensor const &, TensorRefs) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/09python_ffi/src/compiler.cc b/src/09python_ffi/src/compiler.cc index bf04053e9..45450582e 100644 --- a/src/09python_ffi/src/compiler.cc +++ b/src/09python_ffi/src/compiler.cc @@ -95,6 +95,7 @@ namespace refactor::python_ffi { // clang-format off auto target_ = target == "cpu" ? Target::Cpu : target == "cuda" ? Target::Nvidia + : target == "mlu" ? Target::Mlu : UNREACHABLEX(Target, "Unknown target: {}", target); // clang-format on return compileOn(hardware::device::fetch(target_), diff --git a/src/09python_ffi/src/import.cpp b/src/09python_ffi/src/import.cpp index dda0e660c..74cf7cf01 100644 --- a/src/09python_ffi/src/import.cpp +++ b/src/09python_ffi/src/import.cpp @@ -16,6 +16,7 @@ namespace refactor::python_ffi { // clang-format off auto type_ = type == "cpu" ? Device::Type::Cpu : type == "nvidia" ? Device::Type::Nvidia + : type == "mlu" ? 
Device::Type::Mlu : UNREACHABLEX(Device::Type, "Unknown device type: \"{}\"", type); // clang-format on return device::init(type_, card, ""); From 8bde8c1f30b8c0a1c71ccdbd12b7228bc09abd7d Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Fri, 19 Jan 2024 06:47:11 +0000 Subject: [PATCH 6/9] =?UTF-8?q?feat:=20mlu=E4=B8=8A=E8=B7=91=E9=80=9Allama?= =?UTF-8?q?/gpt2=EF=BC=8C=E7=BB=93=E6=9E=9C=E6=AD=A3=E7=A1=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/global_pool.cc | 6 ++ .../src/kernels/gather/cnnl_kernel.cc | 14 ++-- .../src/kernels/reduce/cnnl_kernel.cc | 11 +-- .../src/kernels/softmax/cnnl_kernel.cc | 9 +- src/04kernel/src/kernels/where/cnnl_kernel.cc | 82 +++++++++---------- src/04kernel/src/kernels/where/cnnl_kernel.hh | 4 +- .../src/utilities/bang/cnnl_context.cc | 9 ++ .../src/utilities/bang/cnnl_context.hh | 2 + .../src/utilities/bang/cnrt_functions.cc | 27 ++++++ .../src/utilities/bang/cnrt_functions.h | 16 ++++ .../test/kernels/gather/test_gather_cnnl.cpp | 45 ++++++++++ src/09python_ffi/CMakeLists.txt | 4 + src/09python_ffi/src/executor.cc | 13 ++- 13 files changed, 181 insertions(+), 61 deletions(-) create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.cc create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.h diff --git a/src/04kernel/src/collectors/global_pool.cc b/src/04kernel/src/collectors/global_pool.cc index 1ae1d7fc2..e6a278c1f 100644 --- a/src/04kernel/src/collectors/global_pool.cc +++ b/src/04kernel/src/collectors/global_pool.cc @@ -1,5 +1,6 @@ #include "kernel/collectors/global_pool.h" #include "../kernels/pool/cudnn_kernel.hh" +#include "../kernels/pool/cnnl_kernel.hh" namespace refactor::kernel { @@ -28,6 +29,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PoolCnnl::build(type, false, kernelShape, attributes, x, y); ptr) { + ans.emplace_back(std::move(ptr)); + } + 
break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index d5ddcacec..b4d5aa154 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -4,6 +4,7 @@ #include "../../utilities/bang/cnnl_context.hh" #include "../../utilities/bang/cnnl_functions.h" #endif +#include namespace refactor::kernel { using K = GatherCnnl; @@ -15,11 +16,11 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - + return std::make_unique(decltype(info){ input.dataType, DataType::I32, - axis, + axis ? axis : 0, std::vector(input.shape.begin(), input.shape.end()), std::vector(index.shape.begin(), index.shape.end()), std::vector(output.shape.begin(), output.shape.end()), @@ -70,15 +71,16 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), - shape = info.inDim.data(), workspaceSize, + shape = std::vector(info.inDim.begin(), info.inDim.end()), + workspaceSize, dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV)); + res.fetchOrStore()->copyFromCPU(workspace, shape.data(), workspaceSize); CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore()->handle, dim, d->inDesc, inputs[0], reinterpret_cast(workspace), - d->indexDesc, reinterpret_cast(inputs[1]), + d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); - }; + }; return {std::move(routine), workspaceSize}; } diff --git a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc index 6b22d793d..4ea6fd827 100644 --- a/src/04kernel/src/kernels/reduce/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/reduce/cnnl_kernel.cc @@ -71,14 +71,15 @@ namespace refactor::kernel { std::vector 
dimsI(shape.begin(), shape.end()), - dimsO(shape.begin(), shape.end()); + dimsO(shape.begin(), shape.end()), + indices(axes.begin(), axes.end()); for (auto axis : axes) { dimsO[axis] = 1; } // setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size())); // setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data())); // clang-format off auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG @@ -91,12 +92,12 @@ namespace refactor::kernel { : UNREACHABLEX(cnnlReduceOp_t, ""); // clang-format on CNNL_ASSERT(cnnlSetReduceDescriptor_v2( - d->reduce, (int *) (axes.data()), axes.size(), reduceOp, + d->reduce, indices.data(), indices.size(), reduceOp, cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64), CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0)); auto handler = res.fetchOrStore()->handle; - size_t idxWorkspaceSize = axes.size() * sizeof(int); + size_t idxWorkspaceSize = indices.size() * sizeof(int); // idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256); size_t workspaceSize; // get workspace diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc index 865e452e1..0633195d5 100644 --- a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -59,9 +59,11 @@ namespace refactor::kernel { static_cast(algo), dataType != DataType::F64); int dims[]{pre, mid, post}; - cnnlSoftmaxMode_t mode = (post == 1) ? 
CNNL_SOFTMAX_MODE_HIGH_DIMENSION - : (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION - : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + // cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION + // : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION + // : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + // FIXME(bolun): CNNL Softmax mode + cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; // cnnlSoftmaxForward_v2 is applied to a 3D input tensor only CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims)); @@ -78,6 +80,7 @@ namespace refactor::kernel { CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &a, d->t, inputs[0], &b, d->t, outputs[0])); + res.fetchOrStore()->queueSync(); }; } diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc index 774c5513d..50b7c9d85 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -16,13 +16,24 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - return std::make_unique(decltype(info) { - inputs[1].get().dataType, - inputs[0].get().shape, - inputs[1].get().shape, - inputs[2].get().shape, - outputs[0].get().shape, - }); + std::vector cDim(inputs[0].get().shape.begin(), inputs[0].get().shape.end()), + xDim(inputs[1].get().shape.begin(), inputs[1].get().shape.end()), + yDim(inputs[2].get().shape.begin(), inputs[2].get().shape.end()), + ansDim(outputs[0].get().shape.begin(), outputs[0].get().shape.end()); + if (ansDim.size() == 0) { + ansDim.push_back(1); + } + if (xDim.size() == 0) { + xDim.push_back(1); + } + if (yDim.size() == 0) { + yDim.push_back(1); + } + if (cDim.size() == 0) { + cDim.push_back(1); + } + return std::make_unique(decltype(info){ + inputs[1].get().dataType, cDim, xDim, yDim, ansDim}); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1; @@ -44,11 +55,10 @@ namespace refactor::kernel { struct Descriptors { cnnlTensorDescriptor_t cond, x, 
y, ans; - bool f32; - explicit Descriptors(decltype(f32) f32_) + explicit Descriptors() : cond(nullptr), x(nullptr), y(nullptr), - ans(nullptr), f32(f32_) { + ans(nullptr) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); @@ -64,29 +74,20 @@ namespace refactor::kernel { Descriptors(const Descriptors &) = delete; Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(info.dataType != DT::F64); - - std::vector cDim(info.condDim.begin(), info.condDim.end()), - xDim(info.thenDim.begin(), info.thenDim.end()), - yDim(info.elseDim.begin(), info.elseDim.end()), - ansDim(info.outputDim.begin(), info.outputDim.end()); - - auto rightAlign = [](std::vector &dim, uint32_t targetLength) { - if (dim.size() < targetLength) { - dim.insert(dim.begin(), targetLength - dim.size(), 1); - } - }; - if (ansDim.size() == 0) { - ansDim.push_back(1); - } - rightAlign(cDim, ansDim.size()); - rightAlign(xDim, ansDim.size()); - rightAlign(yDim, ansDim.size()); - - CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data())); - CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data())); + auto d = std::make_shared(); + + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->cond, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(DT::Bool), + info.condDim.size(), info.condDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.thenDim.size(), info.thenDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + 
info.elseDim.size(), info.elseDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->ans, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outputDim.size(), info.outputDim.data())); auto handle = res.fetchOrStore()->handle; size_t workspaceSize; @@ -94,19 +95,14 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - // fetch cnnl handle from resources - auto handle = res.fetchOrStore()->handle; - auto cond = inputs[0], - x = inputs[1], - y = inputs[2]; - auto ans = outputs[0]; CNNL_ASSERT(cnnlSelectV2( - handle, d->cond, cond, d->x, x, - d->y, y, workspace, workspaceSize, - d->ans, ans)); + res.fetchOrStore()->handle, + d->cond, inputs[0], d->x, inputs[1], + d->y, inputs[2], workspace, workspaceSize, + d->ans, outputs[0])); - cnrtQueueSync(res.fetchOrStore()->queue); + res.fetchOrStore()->queueSync(); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.hh b/src/04kernel/src/kernels/where/cnnl_kernel.hh index 6139b1832..ffe39a875 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/where/cnnl_kernel.hh @@ -7,12 +7,10 @@ namespace refactor::kernel { - using Shape = absl::InlinedVector; - struct WhereCnnl final : public Kernel { struct { DataType dataType; - Shape condDim, thenDim, elseDim, outputDim; + std::vector condDim, thenDim, elseDim, outputDim; } info; WhereCnnl(decltype(info)) noexcept; diff --git a/src/04kernel/src/utilities/bang/cnnl_context.cc b/src/04kernel/src/utilities/bang/cnnl_context.cc index 15cc13829..f2ad33ab5 100644 --- a/src/04kernel/src/utilities/bang/cnnl_context.cc +++ b/src/04kernel/src/utilities/bang/cnnl_context.cc @@ -30,6 +30,15 @@ namespace refactor::kernel::cnnl { return "CnnlContext"; } + void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) { + BANG_ASSERT(cnrtMemcpy(dst, 
const_cast(src), size, + CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + + void CnnlContext::queueSync() { + BANG_ASSERT(cnrtQueueSync(queue)); + } + }// namespace refactor::kernel::cnnl #endif diff --git a/src/04kernel/src/utilities/bang/cnnl_context.hh b/src/04kernel/src/utilities/bang/cnnl_context.hh index 7db40d3d3..4743a0e4e 100644 --- a/src/04kernel/src/utilities/bang/cnnl_context.hh +++ b/src/04kernel/src/utilities/bang/cnnl_context.hh @@ -22,6 +22,8 @@ namespace refactor::kernel::cnnl { size_t resourceTypeId() const noexcept final; std::string_view description() const noexcept final; + void copyFromCPU(void *dst, const void *src, size_t size); + void queueSync(); }; }// namespace refactor::kernel::cnnl diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc new file mode 100644 index 000000000..2ea66194d --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -0,0 +1,27 @@ +#ifdef USE_BANG +#include "cnrt_functions.h" +#include "cnnl_functions.h" +#include +#include + +namespace refactor::kernel::cnnl { + + int currentDevice() { + int device; + BANG_ASSERT(cnrtGetDevice(&device)); + return device; + } + + void sync() { + BANG_ASSERT(cnrtSyncDevice()); + } + + void copyOut(void *dst, const void *src, size_t size) { + sync(); + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), size, + CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + +}// namespace refactor::kernel::cnnl + +#endif diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h new file mode 100644 index 000000000..ef119819c --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -0,0 +1,16 @@ +#ifndef KERNEL_CNRT_FUNCTIONS_H +#define KERNEL_CNRT_FUNCTIONS_H + +#include "common.h" + +namespace refactor::kernel::cnnl { + + int currentDevice(); + + void sync(); + + void copyOut(void *dst, const void *src, size_t size); + +}// namespace refactor::kernel::cnnl + +#endif// 
KERNEL_CNRT_FUNCTIONS_H diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp index a3f739cd0..020b5f914 100644 --- a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -94,6 +94,51 @@ TEST(kernel, GatherCnnl) { EXPECT_FLOAT_EQ(c[i], result[i]); } } + + // Case axis = 1, indexType= int32 + { + // Create Tensor and build kernels + auto data = Tensor::share(DataType::F32, Shape{32, 16}, LayoutType::NCHW); + auto indices = Tensor::share(DataType::I64, Shape{1, 4}, LayoutType::NCHW); + auto output = Tensor::share(DataType::F32, Shape{1, 4, 16}, LayoutType::NCHW); + GatherInfo info(0, *data, *indices); + auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output); + auto cpuKernel = GatherCpu::build(info); + ASSERT_TRUE(cnnlKernel && cpuKernel); + auto res = runtime::Resources(); + auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res); + auto cpuRoutine = cpuKernel->lower(res).routine; + // Init inputs and outputs + std::vector a; + for (auto i = 0; i < data->elementsSize(); i++) { + a.push_back(i + 0.1f); + } + std::vector b(indices->elementsSize(), 0); + std::vector c(output->elementsSize()); + auto workspace = dev.malloc(workspaceSize), + aMLU = dev.malloc(data->bytesSize()), + bMLU = dev.malloc(indices->bytesSize()), + cMLU = dev.malloc(output->bytesSize()); + aMLU->copyFromHost(a.data(), data->bytesSize()); + bMLU->copyFromHost(b.data(), indices->bytesSize()); + // Compute + { + void const *inputs[]{*aMLU, *bMLU}; + void *outputs[]{*cMLU}; + cnnlRoutine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{a.data(), b.data()}; + void *outputs[]{c.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // Compare + std::vector result(output->elementsSize()); + cMLU->copyToHost(result.data(), output->bytesSize()); + for (auto i : range0_(c.size())) { + EXPECT_FLOAT_EQ(c[i], result[i]); + } + } } #endif diff 
--git a/src/09python_ffi/CMakeLists.txt b/src/09python_ffi/CMakeLists.txt index ccce34d37..09567c9da 100644 --- a/src/09python_ffi/CMakeLists.txt +++ b/src/09python_ffi/CMakeLists.txt @@ -10,6 +10,10 @@ pybind11_add_module(python_ffi SHARED ${PYFFI_SRC}) target_link_libraries(python_ffi PRIVATE onnx llm communication) target_include_directories(python_ffi PRIVATE include) +if(USE_BANG) + target_include_directories(python_ffi PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../04kernel/src/utilities/bang) +endif() + # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a # define (VERSION_INFO) here. # target_compile_definitions(python_ffi diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc index c6a20cb95..947410ccb 100644 --- a/src/09python_ffi/src/executor.cc +++ b/src/09python_ffi/src/executor.cc @@ -7,6 +7,10 @@ #include "kernel/cuda/functions.cuh" #endif// USE_CUDA +#ifdef USE_BANG +#include "cnrt_functions.h" +#endif// USE_BANG + namespace refactor::python_ffi { Executor::Executor(computation::Graph graph, runtime::Stream stream) @@ -70,9 +74,13 @@ namespace refactor::python_ffi { void Executor::bench(bool sync) { #ifdef USE_CUDA auto ans = _stream.bench(sync ? kernel::cuda::sync : nullptr); +#else + #ifdef USE_BANG + auto ans = _stream.bench(sync ? 
kernel::cnnl::sync : nullptr); #else auto ans = _stream.bench(nullptr); -#endif// USE_CUDA + #endif +#endif auto const &nodes = _graph.internal().contiguous().nodes; for (auto i : range0_(nodes.size())) { fmt::println("{} {} {}", @@ -213,6 +221,9 @@ namespace refactor::python_ffi { #ifdef USE_CUDA kernel::cuda::copyOut(buffer.data(), addresses[idx], size); #endif +#ifdef USE_BANG + kernel::cnnl::copyOut(buffer.data(), addresses[idx], size); +#endif auto file = path / fmt::format("data{:06}.{}", dataIdx++, format); fs::remove(file); From 2258c1ee2d09cfe0add168d5944e6d3ec349c634 Mon Sep 17 00:00:00 2001 From: Zhang Bolun Date: Fri, 26 Jan 2024 15:13:09 +0800 Subject: [PATCH 7/9] =?UTF-8?q?fix:=20=E8=A7=A3=E5=86=B3=E4=B8=80=E4=BA=9B?= =?UTF-8?q?warning=EF=BC=8C=E5=B9=B6=E6=8A=8Async=E6=93=8D=E4=BD=9C?= =?UTF-8?q?=E4=BB=8E=E7=AE=97=E5=AD=90=E5=86=85=E9=83=A8=E7=A7=BB=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../batch_normalization/cnnl_kernel.cc | 5 ++-- src/04kernel/src/kernels/cast/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/clip/cnnl_kernel.cc | 1 - .../src/kernels/concat/cnnl_kernel.cc | 4 ++-- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 6 ++--- .../src/kernels/expand/cnnl_kernel.cc | 1 - .../src/kernels/gather/cnnl_kernel.cc | 1 - .../src/kernels/mat_mul/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/pool/cnnl_kernel.cc | 4 ++-- .../src/kernels/simple_binary/binary_cnnl.cc | 1 - src/04kernel/src/kernels/slice/cnnl_kernel.cc | 2 +- .../src/kernels/softmax/cnnl_kernel.cc | 1 - src/04kernel/src/kernels/split/cnnl_kernel.cc | 4 ++-- src/04kernel/src/kernels/where/cnnl_kernel.cc | 1 - .../src/utilities/bang/cnrt_functions.cc | 4 ++-- .../src/utilities/bang/cnrt_functions.h | 4 ++-- .../kernels/batch_normalization/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/cast/test_cnnl.cpp | 4 +++- src/04kernel/test/kernels/clip/test_cnnl.cpp | 4 +++- .../test/kernels/concat/test_cnnl.cpp | 4 +++- 
src/04kernel/test/kernels/conv/test_cnnl.cpp | 11 ++------- .../test/kernels/expand/test_cnnl.cpp | 2 ++ .../test/kernels/gather/test_gather_cnnl.cpp | 6 ++++- .../test/kernels/mat_mul/test_cnnl.cpp | 5 ++++ src/04kernel/test/kernels/pool/test_cnnl.cpp | 2 ++ .../test/kernels/reduce/test_cnnl.cpp | 2 ++ .../test/kernels/scatter_nd/test_cnnl.cpp | 4 +++- .../simple_binary/test_binary_cnnl.cpp | 23 ++----------------- .../test/kernels/simple_unary/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/slice/test_cnnl.cpp | 4 +++- .../test/kernels/softmax/test_cnnl.cpp | 4 +++- src/04kernel/test/kernels/split/test_cnnl.cpp | 4 +++- .../test/kernels/transpose/test_cnnl.cpp | 2 ++ src/04kernel/test/kernels/where/test_cnnl.cpp | 2 ++ src/09python_ffi/src/executor.cc | 4 ++-- 35 files changed, 67 insertions(+), 65 deletions(-) diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc index e3efcd1d3..1330cfa7a 100644 --- a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -128,8 +128,8 @@ namespace refactor::kernel { auto y = outputs[0]; void *xTrans = workspace; - void *yTrans = xTrans + xTransSize; - void *cursor = yTrans + xTransSize; + void *yTrans = reinterpret_cast(xTrans) + xTransSize; + void *cursor = reinterpret_cast(yTrans) + xTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, @@ -147,7 +147,6 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans, d->inDesc, y, cursor, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), totalWorkspaceSize}; diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc index 951209741..54798e670 100644 --- a/src/04kernel/src/kernels/cast/cnnl_kernel.cc +++ 
b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -65,7 +65,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.cc b/src/04kernel/src/kernels/clip/cnnl_kernel.cc index 423939bc9..fe65e99b3 100644 --- a/src/04kernel/src/kernels/clip/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.cc @@ -57,7 +57,6 @@ namespace refactor::kernel { CNNL_POINTER_MODE_DEVICE, d->t, inputs[0], inputs[1], hasMax ? inputs[2] : nullptr, d->t, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc index 6c7a0fbaf..c35b1c33f 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -52,7 +52,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -62,7 +62,7 @@ namespace refactor::kernel { }; auto d = std::make_shared(info.num, info.dataType != DT::F64); setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); } diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc index 85dcb9517..187ac4b51 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ 
-209,9 +209,9 @@ namespace refactor::kernel { // } void *xTrans = workspace; - void *wTrans = xTrans + xTransSize; - void *yTrans = wTrans + wTransSize; - void *opWorkspace = yTrans + yTransSize; + void *wTrans = reinterpret_cast(xTrans) + xTransSize; + void *yTrans = reinterpret_cast(wTrans) + wTransSize; + void *opWorkspace = reinterpret_cast(yTrans) + yTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x, diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.cc b/src/04kernel/src/kernels/expand/cnnl_kernel.cc index 18f586422..426eac49d 100644 --- a/src/04kernel/src/kernels/expand/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.cc @@ -60,7 +60,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlExpand(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } #endif diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index b4d5aa154..abdf0d5c4 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -79,7 +79,6 @@ namespace refactor::kernel { d->inDesc, inputs[0], reinterpret_cast(workspace), d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc index 7be05b7b2..3eac35723 100644 --- a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc @@ -141,7 +141,6 @@ namespace refactor::kernel { workspace, algoWorkspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), 
algoWorkspaceSize}; diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.cc b/src/04kernel/src/kernels/pool/cnnl_kernel.cc index 083125b1f..929ea5789 100644 --- a/src/04kernel/src/kernels/pool/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.cc @@ -130,7 +130,7 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; void *extraInputDev = workspace; - void *poolWorkSpace = workspace + extraInputSize; + void *poolWorkSpace = reinterpret_cast(workspace) + extraInputSize; void *extraInputHost = malloc(extraInputSize); CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost)); @@ -145,7 +145,7 @@ namespace refactor::kernel { &b, extraInputDev, d->y, outputs[0], poolWorkSpace, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + res.fetchOrStore()->queueSync(); free(extraInputHost); }; diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 11712b4f8..8b4647ac8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -180,7 +180,6 @@ namespace refactor::kernel { workspace, workspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.cc b/src/04kernel/src/kernels/slice/cnnl_kernel.cc index b5f8b4c85..85bc90938 100644 --- a/src/04kernel/src/kernels/slice/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.cc @@ -64,7 +64,7 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data())); std::vector begin, end, stride; - for (auto i = 0; i < info.dims.size(); i++) { + for (size_t 
i = 0; i < info.dims.size(); i++) { // [begin, end), end is not inclued begin.push_back(info.dims[i].start); auto sign = info.dims[i].step > 0 ? 1 : -1; diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc index 0633195d5..babaf33cc 100644 --- a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -80,7 +80,6 @@ namespace refactor::kernel { CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &a, d->t, inputs[0], &b, d->t, outputs[0])); - res.fetchOrStore()->queueSync(); }; } diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc index c967bff2d..8f686d597 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -69,7 +69,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -81,7 +81,7 @@ namespace refactor::kernel { // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data())); } diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc index 50b7c9d85..14f8d6676 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -102,7 +102,6 @@ namespace 
refactor::kernel { d->y, inputs[2], workspace, workspaceSize, d->ans, outputs[0])); - res.fetchOrStore()->queueSync(); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc index 2ea66194d..26c1b975d 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.cc +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -4,7 +4,7 @@ #include #include -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice() { int device; @@ -22,6 +22,6 @@ namespace refactor::kernel::cnnl { CNRT_MEM_TRANS_DIR_DEV2HOST)); } -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h index ef119819c..3a05195ce 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.h +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -3,7 +3,7 @@ #include "common.h" -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice(); @@ -11,6 +11,6 @@ namespace refactor::kernel::cnnl { void copyOut(void *dst, const void *src, size_t size); -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif// KERNEL_CNRT_FUNCTIONS_H diff --git a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp index 14a1a07f4..d1ad9bd78 100644 --- a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp +++ b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/batch_normalization/cnnl_kernel.hh" #include "../../../src/kernels/batch_normalization/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -57,6 +58,7 @@ TEST(kernel, BatchNormalizationCnnl) { void const *inputs[]{*mluIn, *mluScale, *mluBias, 
*mluMean, *mluVar}; void *outputs[]{*mluOut}; rMlu(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/cast/test_cnnl.cpp b/src/04kernel/test/kernels/cast/test_cnnl.cpp index 72f842470..94297357d 100644 --- a/src/04kernel/test/kernels/cast/test_cnnl.cpp +++ b/src/04kernel/test/kernels/cast/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/cast/cpu_kernel.hh" #include "../../../src/kernels/cast/cnnl_kernel.hh" +#include "../../../src/kernels/cast/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -34,6 +35,7 @@ TEST(kernel, CastCnnl) { void const *inputs[]{*xMlu}; void *outputs[]{*yMlu}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{x_.data()}; diff --git a/src/04kernel/test/kernels/clip/test_cnnl.cpp b/src/04kernel/test/kernels/clip/test_cnnl.cpp index ab29a1cb5..ff2e77290 100644 --- a/src/04kernel/test/kernels/clip/test_cnnl.cpp +++ b/src/04kernel/test/kernels/clip/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/clip/cpu_kernel.hh" #include "../../../src/kernels/clip/cnnl_kernel.hh" +#include "../../../src/kernels/clip/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ClipCnnl) { void const *inputs[]{*mluMem, *mluMin, *mluMax}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{value.data(), &min, &max}; diff --git a/src/04kernel/test/kernels/concat/test_cnnl.cpp b/src/04kernel/test/kernels/concat/test_cnnl.cpp index 2911c9e90..ecc817aca 100644 --- a/src/04kernel/test/kernels/concat/test_cnnl.cpp +++ b/src/04kernel/test/kernels/concat/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include 
"../../../src/kernels/concat/cpu_kernel.hh" #include "../../../src/kernels/concat/cnnl_kernel.hh" +#include "../../../src/kernels/concat/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -65,6 +66,7 @@ TEST(kernel, ConcatCnnl) { void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; diff --git a/src/04kernel/test/kernels/conv/test_cnnl.cpp b/src/04kernel/test/kernels/conv/test_cnnl.cpp index bf4bff3c8..74e799f5a 100644 --- a/src/04kernel/test/kernels/conv/test_cnnl.cpp +++ b/src/04kernel/test/kernels/conv/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/conv/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -39,15 +40,7 @@ void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const i void const *inputs[]{*xMlu, *wMlu}; void *outputs[]{*yMlu}; routine(res, *workspace, inputs, outputs); - - xMlu->copyToHost(xData.data(), xTensor->bytesSize()); - wMlu->copyToHost(wData.data(), wTensor->bytesSize()); - // fmt::println("{}", vec2str(xData)); - // fmt::println("{}", vec2str(wData)); - - // std::vector ws(workspaceSize); - // workspace->copyToHost(ws.data(), workspaceSize); - // fmt::println("{}", vec2str(ws)); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/expand/test_cnnl.cpp b/src/04kernel/test/kernels/expand/test_cnnl.cpp index e06e59777..43fb07e8d 100644 --- a/src/04kernel/test/kernels/expand/test_cnnl.cpp +++ b/src/04kernel/test/kernels/expand/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/expand/cnnl_kernel.hh" #include "../../../src/kernels/expand/cpu_kernel.hh" +#include 
"../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ExpandCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp index 020b5f914..b63682d26 100644 --- a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/gather/cnnl_kernel.hh" #include "../src/kernels/gather/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -81,6 +83,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -110,7 +113,7 @@ TEST(kernel, GatherCnnl) { auto cpuRoutine = cpuKernel->lower(res).routine; // Init inputs and outputs std::vector a; - for (auto i = 0; i < data->elementsSize(); i++) { + for (size_t i = 0; i < data->elementsSize(); i++) { a.push_back(i + 0.1f); } std::vector b(indices->elementsSize(), 0); @@ -126,6 +129,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; diff --git a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp index 251c5c898..f079b4445 100644 --- a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp +++ 
b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/mat_mul/cnnl_kernel.hh" #include "../src/kernels/mat_mul/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -48,6 +49,7 @@ TEST(kernel, MatMulCnnl_OnlyBias) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); @@ -91,6 +93,7 @@ TEST(kernel, MatMulCnnl_Broadcast) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; @@ -135,6 +138,7 @@ TEST(kernel, MatMulCnnl_TransABNoBias) { void const *inputs[]{*ma, *mb}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data()}; @@ -189,6 +193,7 @@ TEST(kernel, MatMulCnnl_Large) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; diff --git a/src/04kernel/test/kernels/pool/test_cnnl.cpp b/src/04kernel/test/kernels/pool/test_cnnl.cpp index 405bf3f8c..1adf45139 100644 --- a/src/04kernel/test/kernels/pool/test_cnnl.cpp +++ b/src/04kernel/test/kernels/pool/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/pool/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ void testPoolCnnl(PoolType poolType, int rank, const int64_t *pads, const int64_ void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); 
mluMem->copyToHost(result.data(), yTensor->bytesSize()); diff --git a/src/04kernel/test/kernels/reduce/test_cnnl.cpp b/src/04kernel/test/kernels/reduce/test_cnnl.cpp index 32952fead..113fe7664 100644 --- a/src/04kernel/test/kernels/reduce/test_cnnl.cpp +++ b/src/04kernel/test/kernels/reduce/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/reduce/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -28,6 +29,7 @@ static void testReducemean(const Shape &shape, const std::vector &data, void const *inputs[]{*mluMemIn}; void *outputs[]{*mluMemOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data Shape outDimArray; diff --git a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp index 1cdf72166..cf5b9c367 100644 --- a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp +++ b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" #include "../../../src/kernels/scatter_nd/cnnl_kernel.hh" +#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -48,6 +49,7 @@ TEST(kernel, ScatterNDCnnl) { void const *inputs[]{*mluData, *mluIndices, *mluUpdates}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data_.data(), indices_.data(), updates_.data()}; diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp index 7bd18bf6d..dfd3c9b80 100644 --- a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/simple_binary/binary_cnnl.hh" #include 
"../src/kernels/simple_binary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -38,6 +39,7 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -100,27 +102,6 @@ TEST(kernel, BinaryCnnlFMod) { Shape{2, 5, 10, 20, 3, 4}); } -// TEST(kernel, BinaryCnnlMod) { -// testBinaryCnnl(SimpleBinaryType::Mod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodI8) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodF32) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - TEST(kernel, BinaryCnnlBroadcast) { testBinaryCnnl(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 3, 4, 5, 6}); } diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index e4b9428ee..8e1e8a9f6 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -3,6 +3,7 @@ #include "../../../src/kernels/simple_unary/cnnl_activation_kernel.hh" #include "../../../src/kernels/simple_unary/cnnl_simple_unary_kernel.hh" #include "../../../src/kernels/simple_unary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ static void testOp(SimpleUnaryType opType, bool activation = true) { void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const 
*inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/slice/test_cnnl.cpp b/src/04kernel/test/kernels/slice/test_cnnl.cpp index 914a8fee4..1685d7aaa 100644 --- a/src/04kernel/test/kernels/slice/test_cnnl.cpp +++ b/src/04kernel/test/kernels/slice/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/slice/cpu_kernel.hh" #include "../../../src/kernels/slice/cnnl_kernel.hh" +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -45,6 +46,7 @@ TEST(kernel, SliceCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp index a8c7fb283..09874d207 100644 --- a/src/04kernel/test/kernels/softmax/test_cnnl.cpp +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/softmax/cpu_kernel.hh" #include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "../../../src/kernels/softmax/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, SoftmaxCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; rCnnl(res, nullptr, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/split/test_cnnl.cpp b/src/04kernel/test/kernels/split/test_cnnl.cpp index 5afa434e6..71e69b219 100644 --- a/src/04kernel/test/kernels/split/test_cnnl.cpp +++ b/src/04kernel/test/kernels/split/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/split/cpu_kernel.hh" #include "../../../src/kernels/split/cnnl_kernel.hh" +#include "../../../src/kernels/split/cpu_kernel.hh" +#include 
"../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -64,6 +65,7 @@ TEST(kernel, SplitCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOuts[0], *mluOuts[1], *mluOuts[2], *mluOuts[3]}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp index 4f4301d86..f46890714 100644 --- a/src/04kernel/test/kernels/transpose/test_cnnl.cpp +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/transpose/cnnl_kernel.hh" #include "../../../src/kernels/transpose/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -42,6 +43,7 @@ TEST(kernel, TransposeCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(dataTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/where/test_cnnl.cpp b/src/04kernel/test/kernels/where/test_cnnl.cpp index aaad65857..6e26ddbd2 100644 --- a/src/04kernel/test/kernels/where/test_cnnl.cpp +++ b/src/04kernel/test/kernels/where/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/where/cnnl_kernel.hh" #include "../../../src/kernels/where/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -49,6 +50,7 @@ void testWhereCnnl(Shape cDim, Shape xDim, Shape yDim, Shape outDim) { void const *inputs[]{*mluC, *mluX, *mluY}; void *outputs[]{*mluOut}; rCnnl(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc index 947410ccb..1d0e543f1 100644 --- a/src/09python_ffi/src/executor.cc +++ 
b/src/09python_ffi/src/executor.cc @@ -76,7 +76,7 @@ namespace refactor::python_ffi { auto ans = _stream.bench(sync ? kernel::cuda::sync : nullptr); #else #ifdef USE_BANG - auto ans = _stream.bench(sync ? kernel::cnnl::sync : nullptr); + auto ans = _stream.bench(sync ? kernel::bang::sync : nullptr); #else auto ans = _stream.bench(nullptr); #endif @@ -222,7 +222,7 @@ namespace refactor::python_ffi { kernel::cuda::copyOut(buffer.data(), addresses[idx], size); #endif #ifdef USE_BANG - kernel::cnnl::copyOut(buffer.data(), addresses[idx], size); + kernel::bang::copyOut(buffer.data(), addresses[idx], size); #endif auto file = path / fmt::format("data{:06}.{}", dataIdx++, format); From b82c861c3cf08a9eb72e93f4c77ac72f237c5bb3 Mon Sep 17 00:00:00 2001 From: Bolun Date: Wed, 31 Jan 2024 07:21:02 +0000 Subject: [PATCH 8/9] =?UTF-8?q?fix:=20=E6=95=B4=E7=90=86=E5=88=86=E6=94=AF?= =?UTF-8?q?=E5=90=8E=E8=B7=91=E9=80=9A=EF=BC=8C=E6=B7=BB=E5=8A=A0HardSwish?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/kernels/gather/cnnl_kernel.cc | 18 +++++++++------ .../src/kernels/simple_binary/binary_cnnl.cc | 22 ++++++++----------- .../simple_unary/cnnl_activation_kernel.cc | 21 ++++++++++++------ .../test/kernels/simple_unary/test_cnnl.cpp | 1 + .../test/kernels/transpose/test_cnnl.cpp | 4 ++-- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index abdf0d5c4..58b86660b 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -16,13 +16,16 @@ namespace refactor::kernel { #ifndef USE_BANG return nullptr; #endif - + auto indicesDim = std::vector(index.shape.begin(), index.shape.end()); + if (indicesDim.size() == 0) { + indicesDim.push_back(1); + } return std::make_unique(decltype(info){ input.dataType, - DataType::I32, - axis ? 
axis : 0, + index.dataType, + axis, std::vector(input.shape.begin(), input.shape.end()), - std::vector(index.shape.begin(), index.shape.end()), + std::move(indicesDim), std::vector(output.shape.begin(), output.shape.end()), }); } @@ -60,8 +63,9 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTensorDescriptor( d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); + // cnnlGatherV2 does not support int64 indices CNNL_ASSERT(cnnlSetTensorDescriptor( - d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType), + d->indexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32, info.indexDim.size(), info.indexDim.data())); CNNL_ASSERT(cnnlSetTensorDescriptor( d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), @@ -71,7 +75,7 @@ namespace refactor::kernel { res.fetchOrStore(); auto routine = [d = std::move(d), - shape = std::vector(info.inDim.begin(), info.inDim.end()), + shape = std::vector(info.inDim.begin(), info.inDim.end()), workspaceSize, dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { res.fetchOrStore()->copyFromCPU(workspace, shape.data(), workspaceSize); @@ -79,7 +83,7 @@ namespace refactor::kernel { d->inDesc, inputs[0], reinterpret_cast(workspace), d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); - }; + }; return {std::move(routine), workspaceSize}; } diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 8b4647ac8..d62ba0ab8 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -26,10 +26,7 @@ namespace refactor::kernel { // !a.dataType.isFloat() || !ARTHIMETIC.contains(op) || // At least one of a,b should have the same shape as c - (a.shape != c.shape && b.shape != c.shape) - // Sub only supports brocasting b - // (a.shape != c.shape && op == Op::Sub) - ) { + (a.shape != 
c.shape && b.shape != c.shape)) { return nullptr; } @@ -124,9 +121,9 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; size_t workspaceSize; CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc, - d->bDesc, d->cDesc, - &workspaceSize)); - + d->bDesc, d->cDesc, + &workspaceSize)); + res.fetchOrStore(); auto routine = [d = std::move(d), @@ -147,11 +144,11 @@ namespace refactor::kernel { beta = d->f32 ? factor(0) : factor(0); - CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, - &alphaA, d->aDesc, a, - &alphaB, d->bDesc, b, - workspace, workspaceSize, - &beta, d->cDesc, c)); + CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc, + &alphaA, d->aDesc, a, + &alphaB, d->bDesc, b, + workspace, workspaceSize, + &beta, d->cDesc, c)); } else if (op == SimpleBinaryType::Div) { CNNL_ASSERT(cnnlDiv_v2(handle, CNNL_COMPUTATION_HIGH_PRECISION, @@ -179,7 +176,6 @@ namespace refactor::kernel { d->cDesc, c, workspace, workspaceSize)); } - }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc index d35535948..68670662c 100644 --- a/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc +++ b/src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc @@ -17,7 +17,7 @@ namespace refactor::kernel { : Kernel(), type(type_), dataType(dataType_), size(size_) {} auto K::build(Op op, Tensor const &a) noexcept -> KernelBox { - static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh}; + static const std::unordered_set ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh, Op::HardSwish}; #ifndef USE_BANG return nullptr; @@ -64,20 +64,27 @@ namespace refactor::kernel { auto d = std::make_shared(); // clang-format off - auto mode = type == Ty::Relu ? CNNL_ACTIVATION_RELU - : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID - : type == Ty::Tanh ? CNNL_ACTIVATION_TANH + auto mode = type == Ty::Relu ? 
CNNL_ACTIVATION_RELU + : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID + : type == Ty::Tanh ? CNNL_ACTIVATION_TANH + : type == Ty::HardSwish ? CNNL_ACTIVATION_HARDSWISH : UNREACHABLEX(cnnlActivationMode_t, ""); + float coef = 0.0; + float slicedDim = 0.0; + float gamma = 0.0; + float scale = 0.0; // clang-format on setCnnlTensor(d->tensor, dataType, slice(&size, 1)); - CNNL_ASSERT(cnnlSetActivationDescriptor_v2(d->activation, mode, CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, 0.0)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, mode, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, coef, + slicedDim, gamma, scale, true)); res.fetchOrStore(); return [d = std::move(d)]// (Resources & res, void *, void const *const *inputs, void *const *outputs) { - float alpha = 1, beta = 0; + float alpha = 1.f, beta = 0.f; CNNL_ASSERT(cnnlActivationForward( res.fetchOrStore()->handle, d->activation, diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index 8e1e8a9f6..3964989c9 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -60,6 +60,7 @@ TEST(kernel, ActivationCnnl) { testOp(SimpleUnaryType::Relu); testOp(SimpleUnaryType::Sigmoid); testOp(SimpleUnaryType::Tanh); + testOp(SimpleUnaryType::HardSwish); } diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp index f46890714..9acdd95ba 100644 --- a/src/04kernel/test/kernels/transpose/test_cnnl.cpp +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -14,8 +14,8 @@ using namespace hardware; TEST(kernel, TransposeCnnl) { // build routine auto dataTensor = Tensor::share(DataType::F32, Shape{1, 3, 2, 5}); - auto info = TransposeInfo(dataTensor->shape, Permutation{2, 3, 0, 1}); - auto kCpu = TransposeCpu::build(dataTensor->dataType, info); + auto info = TransposeInfo(dataTensor->dataType, 
dataTensor->shape, Permutation{2, 3, 0, 1}); + auto kCpu = TransposeCpu::build(info); auto kernel = TransposeCnnl::build(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1}); ASSERT_TRUE(kCpu && kernel); auto res = runtime::Resources(); From 626ae8fe5aece306ea5a7c2f0a5d54d3ad22c92e Mon Sep 17 00:00:00 2001 From: Bolun Date: Wed, 31 Jan 2024 08:14:02 +0000 Subject: [PATCH 9/9] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0hardsigmoid/pad/m?= =?UTF-8?q?in/max=E7=AE=97=E5=AD=90=EF=BC=8C=E4=BF=AE=E5=A4=8Dconv/transpo?= =?UTF-8?q?se=E7=AD=89=E5=87=BA=E7=8E=B0=E7=9A=84=E5=B0=8F=E9=94=99?= =?UTF-8?q?=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/04kernel/src/collectors/hard_sigmoid.cc | 6 + src/04kernel/src/collectors/pad.cc | 7 +- src/04kernel/src/collectors/select.cc | 4 + src/04kernel/src/kernels/cast/cnnl_kernel.cc | 19 ++- src/04kernel/src/kernels/conv/cnnl_kernel.cc | 67 +++----- src/04kernel/src/kernels/conv/cnnl_kernel.hh | 2 +- .../src/kernels/hard_sigmoid/cnnl_kernel.cc | 81 ++++++++++ .../src/kernels/hard_sigmoid/cnnl_kernel.hh | 27 ++++ src/04kernel/src/kernels/pad/cnnl_kernel.cc | 95 +++++++++++ src/04kernel/src/kernels/pad/cnnl_kernel.hh | 28 ++++ .../src/kernels/scatter_nd/cnnl_kernel.cc | 1 - .../src/kernels/select/cnnl_kernel.cc | 151 ++++++++++++++++++ .../src/kernels/select/cnnl_kernel.hh | 33 ++++ .../src/kernels/transpose/cnnl_kernel.cc | 8 +- .../test/kernels/hard_sigmoid/test_cnnl.cpp | 51 ++++++ src/04kernel/test/kernels/pad/test_cnnl.cpp | 131 +++++++++++++++ .../test/kernels/select/test_cnnl.cpp | 99 ++++++++++++ 17 files changed, 757 insertions(+), 53 deletions(-) create mode 100644 src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh create mode 100644 src/04kernel/src/kernels/pad/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/pad/cnnl_kernel.hh create mode 100644 
src/04kernel/src/kernels/select/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/select/cnnl_kernel.hh create mode 100644 src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/pad/test_cnnl.cpp create mode 100644 src/04kernel/test/kernels/select/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/hard_sigmoid.cc b/src/04kernel/src/collectors/hard_sigmoid.cc index 69d2f9d1e..c44151c20 100644 --- a/src/04kernel/src/collectors/hard_sigmoid.cc +++ b/src/04kernel/src/collectors/hard_sigmoid.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/hard_sigmoid.h" +#include "../kernels/hard_sigmoid/cnnl_kernel.hh" #include "../kernels/hard_sigmoid/cpu_kernel.hh" #include "../kernels/hard_sigmoid/cuda_kernel.hh" @@ -20,6 +21,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = HardSigmoidCnnl::build(alpha, beta, a); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/collectors/pad.cc b/src/04kernel/src/collectors/pad.cc index f4c995e0b..1c7be68c4 100644 --- a/src/04kernel/src/collectors/pad.cc +++ b/src/04kernel/src/collectors/pad.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/pad.h" +#include "../kernels/pad/cnnl_kernel.hh" #include "../kernels/pad/cpu_kernel.hh" #include "../kernels/pad/cuda_kernel.hh" @@ -22,6 +23,11 @@ namespace refactor::kernel { ans.emplace_back(std::move(ptr)); } break; + case decltype(_target)::Mlu: + if (auto ptr = PadCnnl::build(dims, input.get().dataType, mode, const_value); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; default: UNREACHABLEX(void, "Unknown target"); } @@ -29,4 +35,3 @@ namespace refactor::kernel { } }// namespace refactor::kernel - diff --git a/src/04kernel/src/collectors/select.cc b/src/04kernel/src/collectors/select.cc index e4eff8f4b..3cec750e8 100644 --- a/src/04kernel/src/collectors/select.cc +++ 
b/src/04kernel/src/collectors/select.cc @@ -1,6 +1,7 @@ #include "kernel/collectors/select.h" #include "../kernels/select/cpu_kernel.hh" #include "../kernels/select/cuda_kernel.hh" +#include "../kernels/select/cnnl_kernel.hh" namespace refactor::kernel { @@ -35,6 +36,9 @@ namespace refactor::kernel { case decltype(_target)::Nvidia: REGISTER(SelectCuda) break; + case decltype(_target)::Mlu: + REGISTER(SelectCnnl) + break; default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc index 54798e670..735692b90 100644 --- a/src/04kernel/src/kernels/cast/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -46,8 +46,10 @@ namespace refactor::kernel { struct Descriptors { cnnlTensorDescriptor_t inDesc, outDesc; cnnlCastDataType_t cast; + bool needCast; - Descriptors() : inDesc(nullptr), outDesc(nullptr) { + Descriptors(bool need) : inDesc(nullptr), outDesc(nullptr), + needCast(need) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); } @@ -56,15 +58,22 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); } }; - auto d = std::make_shared(); - d->cast = castType(from, to); + auto d = std::make_shared(from != to); + if (d->needCast) { + d->cast = castType(from, to); + } setCnnlTensor(d->inDesc, from, slice(shape.data(), shape.size())); setCnnlTensor(d->outDesc, to, slice(shape.data(), shape.size())); res.fetchOrStore(); return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, - d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + if (d->needCast) { + CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); + } else { + CNNL_ASSERT(cnnlCopy(res.fetchOrStore()->handle, + d->inDesc, inputs[0], d->outDesc, 
outputs[0])); + } }; } diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc index 187ac4b51..0974a7600 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ -29,15 +29,10 @@ namespace refactor::kernel { return nullptr; } - std::optional biasExpand = std::nullopt; + int biasSize_ = 0; if (b) { ASSERT(b->get().shape[0] == y.shape[1], ""); - std::vector input(y.rank(), 1); - input[1] = y.shape[1]; - biasExpand.emplace(ExpandInfoCnnl( - b->get().dataType, - slice(input.data(), input.size()), - slice(y.shape.data(), y.rank()))); + biasSize_ = b->get().shape[0]; } // group is not supported @@ -70,7 +65,7 @@ namespace refactor::kernel { {d[0], d[1]}, {p[0], p[1], p[2], p[3]}, {s[0], s[1]}, - std::move(biasExpand), + biasSize_, }); } @@ -92,22 +87,18 @@ namespace refactor::kernel { // RAII for closure struct Descriptors { - cnnlTensorDescriptor_t x, y, w; + cnnlTensorDescriptor_t x, y, w, b; cnnlTensorDescriptor_t xTrans, yTrans, wTrans; cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW; cnnlConvolutionDescriptor_t conv; cnnlConvolutionForwardAlgo_t algo; - // std::optional extraPadding; - std::optional biasExpand; - bool f32; - - Descriptors(decltype(f32) f32_) - :// extraPadding(std::nullopt), - biasExpand(std::nullopt), - f32(f32_) { + bool bias; + + Descriptors(decltype(bias) bias_) : bias(bias_) { CNNL_ASSERT(cnnlCreateTensorDescriptor(&x)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&y)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&w)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&b)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&xTrans)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&yTrans)); CNNL_ASSERT(cnnlCreateTensorDescriptor(&wTrans)); @@ -119,6 +110,7 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlDestroyTensorDescriptor(x)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(y)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(w)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(b)); 
CNNL_ASSERT(cnnlDestroyTensorDescriptor(xTrans)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(yTrans)); CNNL_ASSERT(cnnlDestroyTensorDescriptor(wTrans)); @@ -130,11 +122,8 @@ namespace refactor::kernel { Descriptors(const Descriptors &) = delete; Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(info.dt != DataType::F64); - // d->extraPadding = ExtraPadding::build(info.dt, info.xShape, info.pad); - if (info.biasExpand) { - d->biasExpand = ExpandCnnl(*info.biasExpand).lower(res).routine; - } + auto d = std::make_shared(info.biasSize > 0); + int xs[]{ info.xShape[0], info.xShape[1], @@ -154,10 +143,15 @@ namespace refactor::kernel { setCnnlTensor(d->x, info.dt, slice(xs, 4)); setCnnlTensor(d->y, info.dt, slice(info.yShape, 4)); setCnnlTensor(d->w, info.dt, slice(info.wShape, 4)); + CNNL_ASSERT(cnnlSetTensorDescriptor(d->xTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, xsNHWC.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->yTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, ysNHWC.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->wTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, wsNHWC.data())); - + if (d->bias) { + int biasDim[] = {1, 1, 1, info.biasSize}; + CNNL_ASSERT(cnnlSetTensorDescriptor(d->b, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, biasDim)); + } + auto xTransSize = cnnlGetTensorElementNum(d->xTrans) * info.dt.size(); auto yTransSize = cnnlGetTensorElementNum(d->yTrans) * info.dt.size(); auto wTransSize = cnnlGetTensorElementNum(d->wTrans) * info.dt.size(); @@ -188,10 +182,6 @@ namespace refactor::kernel { handle, d->xTrans, d->wTrans, d->yTrans, NULL, d->conv, d->algo, &convWorkspaceSize)); - // if (d->extraPadding) { - // workspaceSize = hardware::alignBytes(workspaceSize, 256); - // } - size_t workspaceSize = xTransSize + yTransSize + wTransSize + std::max({xWorkspaceSize, wWorkspaceSize, yWorkspaceSize, convWorkspaceSize}); res.fetchOrStore(); @@ -201,12 +191,6 @@ namespace refactor::kernel { auto handle 
= res.fetchOrStore()->handle; void const *x = inputs[0], *w = inputs[1]; void *y = outputs[0]; - // if (auto f = d->extraPadding; f) { - // x = (*f)(x, reinterpret_cast(workspace) + workspaceSize); - // } - // if (auto f = d->biasExpand; f) { - // (*f)(res, workspace, inputs + 2, outputs); - // } void *xTrans = workspace; void *wTrans = reinterpret_cast(xTrans) + xTransSize; @@ -218,19 +202,16 @@ namespace refactor::kernel { d->xTrans, xTrans, opWorkspace, xWorkspaceSize)); CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->w, w, d->wTrans, wTrans, opWorkspace, wWorkspaceSize)); - - // build alpha/beta for double - auto a = d->f32 ? factor(1) : factor(1), - b = d->f32 - ? factor(d->biasExpand ? 1 : 0) - : factor(d->biasExpand ? 1 : 0); + + auto bDesc = (d->bias) ? d->b : NULL; + auto bData = (d->bias) ? inputs[2] : NULL; CNNL_ASSERT(cnnlConvolutionForward( handle, - d->conv, d->algo, &a, + d->conv, d->algo, NULL, d->xTrans, xTrans, d->wTrans, wTrans, - NULL, NULL, opWorkspace, convWorkspaceSize, - &b, d->yTrans, yTrans)); - + bDesc, bData, opWorkspace, convWorkspaceSize, + NULL, d->yTrans, yTrans)); + // transpose NHWC intermediates to NCHW CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->yTrans, yTrans, d->y, y, opWorkspace, yWorkspaceSize)); diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.hh b/src/04kernel/src/kernels/conv/cnnl_kernel.hh index 814e75ad5..c486cdd17 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.hh +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.hh @@ -19,7 +19,7 @@ namespace refactor::kernel { dilation[2], pad[4], stride[2]; - std::optional biasExpand; + int biasSize; } info; explicit ConvCnnl(decltype(info)) noexcept; diff --git a/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc new file mode 100644 index 000000000..25ec6d0cb --- /dev/null +++ b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc @@ -0,0 +1,81 @@ +#include "cnnl_kernel.hh" +#include 
"kernel/collectors/hard_sigmoid.h" +#include + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#include +#endif + +namespace refactor::kernel { + using K = HardSigmoidCnnl; + using DT = DataType; + + K::HardSigmoidCnnl(float alpha_, float beta_, DT dataType_, int size_) noexcept + : Kernel(), alpha(alpha_), beta(beta_), dataType(dataType_), size(size_) {} + + auto K::build(float alpha_, float beta_, Tensor const &a) noexcept -> KernelBox { + +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(alpha_, beta_, a.dataType, a.elementsSize()); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing hardsigmoid using CNNL"; + } + +#ifdef USE_BANG + + auto HardSigmoidCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlActivationDescriptor_t activation; + cnnlTensorDescriptor_t tensor; + + Descriptors() : activation(nullptr), tensor(nullptr) { + CNNL_ASSERT(cnnlCreateActivationDescriptor(&activation)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyActivationDescriptor(activation)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor)); + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(); + + setCnnlTensor(d->tensor, dataType, slice(&size, 1)); + CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, CNNL_ACTIVATION_HARDSIGMOID, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, 0.0, + 0.0, alpha, beta, true)); + + res.fetchOrStore(); + return [d = std::move(d)]// + (Resources & res, void *, void const *const *inputs, void *const 
*outputs) { + float alpha = 1.f, beta = 0.f; + CNNL_ASSERT(cnnlActivationForward( + res.fetchOrStore()->handle, + d->activation, + &alpha, d->tensor, inputs[0], + &beta, d->tensor, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh new file mode 100644 index 000000000..c343d1fed --- /dev/null +++ b/src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh @@ -0,0 +1,27 @@ +#ifndef KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH +#define KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH + +#include "kernel/collectors/simple_unary.h" + +namespace refactor::kernel { + + struct HardSigmoidCnnl final : public Kernel { + float alpha, beta; + DataType dataType; + int size; + + HardSigmoidCnnl(float, float, DataType, int) noexcept; + + static KernelBox build(float, float, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/pad/cnnl_kernel.cc b/src/04kernel/src/kernels/pad/cnnl_kernel.cc new file mode 100644 index 000000000..39648f851 --- /dev/null +++ b/src/04kernel/src/kernels/pad/cnnl_kernel.cc @@ -0,0 +1,95 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = PadCnnl; + + K::PadCnnl(DataType dataType_, PadType mode_, std::vector inDim_, + std::vector outDim_, std::vector padDim_, size_t len_) noexcept + : Kernel(), dataType(dataType_), mode(mode_), inDim(std::move(inDim_)), + outDim(std::move(outDim_)), padDim(std::move(padDim_)), valueLength(len_) {} + + auto K::build(PadDimension dims_, DataType dataType_, PadType 
mode_, std::optional> value_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + if (mode_ != PadType::Constant || (value_ && value_->get().dataType != dataType_)) { + return nullptr; + } + size_t valueLength_ = value_ ? value_->get().dataType.size() : 0; + std::vector inDim_, outDim_, padDim_; + for (auto dim : dims_) { + inDim_.push_back(dim.dimI); + outDim_.push_back(dim.dimO); + padDim_.push_back(dim.pads); + } + + return std::make_unique(dataType_, mode_, inDim_, outDim_, padDim_, valueLength_); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing Pad using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + cnnlTensorDescriptor_t inDesc, outDesc; + + Descriptors() : inDesc(nullptr), outDesc(nullptr) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc)); + CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc)); + CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc)); + } + }; + auto d = std::make_shared(); + setCnnlTensor(d->inDesc, dataType, slice(inDim.data(), inDim.size())); + setCnnlTensor(d->outDesc, dataType, slice(outDim.data(), outDim.size())); + + std::vector pads; + for (auto d : padDim) { + pads.push_back(d); + pads.push_back(d); + } + + res.fetchOrStore(); + return [d = std::move(d), val = valueLength, + p = std::vector(pads.begin(), pads.end())](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + void *paddingValue; + if (val != 0) { + paddingValue = malloc(val); + BANG_ASSERT(cnrtMemcpy(paddingValue, const_cast(inputs[2]), + val, CNRT_MEM_TRANS_DIR_DEV2HOST)); + } else { + float zero = 
0.0; + paddingValue = &zero; + } + + CNNL_ASSERT(cnnlPad(res.fetchOrStore()->handle, + d->inDesc, inputs[0], p.data(), paddingValue, + d->outDesc, outputs[0])); + + if (val != 0) { + free(paddingValue); + } + }; + } +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/pad/cnnl_kernel.hh b/src/04kernel/src/kernels/pad/cnnl_kernel.hh new file mode 100644 index 000000000..202518fd4 --- /dev/null +++ b/src/04kernel/src/kernels/pad/cnnl_kernel.hh @@ -0,0 +1,28 @@ +#ifndef KERNEL_PAD_CNNL_HH +#define KERNEL_PAD_CNNL_HH + +#include "kernel/attributes/pad_info.h" +#include "kernel/collectors/pad.h" + +namespace refactor::kernel { + + struct PadCnnl final : public Kernel { + DataType dataType; + PadType mode; + std::vector inDim, outDim, padDim; + size_t valueLength; + + PadCnnl(DataType, PadType, std::vector, std::vector, std::vector, size_t) noexcept; + static KernelBox build(PadDimension, DataType, PadType, std::optional>) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif//KERNEL_PAD_CNNL_HH diff --git a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc index 8f181b58b..2152acf5b 100644 --- a/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/scatter_nd/cnnl_kernel.cc @@ -78,7 +78,6 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlScatterNd_v2(res.fetchOrStore()->handle, CNNL_SCATTERND_UPDATE, d->indexDesc, inputs[1], d->updateDesc, inputs[2], d->inDesc, inputs[0], d->outDesc, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } #endif diff --git a/src/04kernel/src/kernels/select/cnnl_kernel.cc b/src/04kernel/src/kernels/select/cnnl_kernel.cc new file mode 100644 index 000000000..be54a8904 --- /dev/null +++ 
b/src/04kernel/src/kernels/select/cnnl_kernel.cc @@ -0,0 +1,151 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = SelectCnnl; + + K::SelectCnnl(decltype(dataType) dataType_, + decltype(selectType) selectType_, + decltype(inputDims) inputDims_, + decltype(outputDims) outputDims_, + decltype(inputsNum) inputsNum_) noexcept + : dataType(dataType_), + selectType(selectType_), + inputDims(std::move(inputDims_)), + outputDims(std::move(outputDims_)), + inputsNum(inputsNum_) {} + + auto K::build(SelectType selectType_, TensorRefs inputs_) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + auto dt = inputs_[0].get().dataType; + std::vector> inputDims_, outputDims_; + for (size_t i = 0; i < inputs_.size(); i++) { + auto shape = std::vector(inputs_[i].get().shape.begin(), + inputs_[i].get().shape.end()); + if (shape.size() == 0) { + shape.push_back(1); + } + inputDims_.push_back(shape); + } + + auto broadcastShape = [](const std::vector &shape1, const std::vector &shape2) -> std::vector { + int max_dim = std::max(shape1.size(), shape2.size()); + + std::vector resultShape(max_dim, 1); + int dim_diff1 = max_dim - shape1.size(); + int dim_diff2 = max_dim - shape2.size(); + + for (int i = 0; i < max_dim; ++i) { + int dim_size1 = (i >= dim_diff1) ? shape1[i - dim_diff1] : 1; + int dim_size2 = (i >= dim_diff2) ? 
shape2[i - dim_diff2] : 1; + resultShape[i] = std::max(dim_size1, dim_size2); + } + + return resultShape; + }; + + for (size_t i = 1; i < inputs_.size(); i++) { + outputDims_.push_back(broadcastShape(inputDims_[i - 1], inputDims_[i])); + } + + return std::make_unique(dt, selectType_, inputDims_, outputDims_, inputs_.size()); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing select operation using CNNL"; + } + +#ifdef USE_BANG + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + struct Descriptors { + std::vector in, out; + + explicit Descriptors(int n) + : in(std::vector(n, nullptr)), + out(std::vector(n - 1, nullptr)) { + for (auto i = 0; i < n; i++) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&in[i])); + if (i != n - 1) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&out[i])); + } + } + } + ~Descriptors() noexcept(false) { + for (size_t i = 0; i < in.size(); i++) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(in[i])); + if (i != in.size() - 1) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); + } + } + } + + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + auto d = std::make_shared(inputsNum); + for (size_t i = 0; i < inputsNum; i++) { + setCnnlTensor(d->in[i], dataType, slice(inputDims[i].data(), inputDims[i].size())); + if (i != inputsNum - 1) { + setCnnlTensor(d->out[i], dataType, slice(outputDims[i].data(), outputDims[i].size())); + } + } + + auto handle = res.fetchOrStore()->handle; + size_t workspaceSize; + switch (selectType) { + case SelectType::Max: + CNNL_ASSERT(cnnlGetMaximumWorkspaceSize(handle, d->out.back(), &workspaceSize)); + break; + case SelectType::Min: + CNNL_ASSERT(cnnlGetMinimumWorkspaceSize(handle, d->out.back(), &workspaceSize)); 
+ break; + default: + UNREACHABLE(); + } + + res.fetchOrStore(); + auto routine = [d = std::move(d), type = selectType, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // fetch cnnl handle from resources + auto handle = res.fetchOrStore()->handle; + + auto select = + (type == SelectType::Max) ? cnnlMaximum + : (type == SelectType::Min) ? cnnlMinimum + : nullptr; + + for (size_t i = 1; i < d->in.size(); i++) { + if (i == 1) { + CNNL_ASSERT(select( + handle, d->in[0], inputs[0], d->in[1], inputs[1], + d->out[0], outputs[0], workspace, workspaceSize)); + } else { + CNNL_ASSERT(select( + handle, d->out[i - 2], outputs[0], d->in[i], inputs[i], + d->out[i - 1], outputs[0], workspace, workspaceSize)); + } + } + }; + + return {std::move(routine), workspaceSize}; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/select/cnnl_kernel.hh b/src/04kernel/src/kernels/select/cnnl_kernel.hh new file mode 100644 index 000000000..680911f36 --- /dev/null +++ b/src/04kernel/src/kernels/select/cnnl_kernel.hh @@ -0,0 +1,33 @@ +#ifndef KERNEL_SELECT_CNNL_KERNEL_HH +#define KERNEL_SELECT_CNNL_KERNEL_HH + +#include "kernel/attributes/broadcaster.h" +#include "kernel/collectors/select.h" +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct SelectCnnl final : public Kernel { + DataType dataType; + SelectType selectType; + std::vector> inputDims; + std::vector> outputDims; + size_t inputsNum; + + SelectCnnl(decltype(dataType), decltype(selectType), decltype(inputDims), + decltype(outputDims), decltype(inputsNum)) noexcept; + + static KernelBox build(SelectType, TensorRefs) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// 
KERNEL_SELECT_CNNL_KERNEL_HH diff --git a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc index a5a6881e4..58f2d4fd4 100644 --- a/src/04kernel/src/kernels/transpose/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/transpose/cnnl_kernel.cc @@ -72,8 +72,12 @@ namespace refactor::kernel { }; auto d = std::make_shared(info.dataType != DT::F64); - setCnnlTensor(d->x, info.dataType, slice(info.inDim.data(), info.inDim.size())); - setCnnlTensor(d->y, info.dataType, slice(info.outDim.data(), info.outDim.size())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.inDim.size(), info.inDim.data())); + CNNL_ASSERT(cnnlSetTensorDescriptor( + d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType), + info.outDim.size(), info.outDim.data())); CNNL_ASSERT(cnnlSetTransposeDescriptor(d->trans, info.perm.size(), info.perm.data())); auto handle = res.fetchOrStore()->handle; diff --git a/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp b/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp new file mode 100644 index 000000000..ad26438bf --- /dev/null +++ b/src/04kernel/test/kernels/hard_sigmoid/test_cnnl.cpp @@ -0,0 +1,51 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/hard_sigmoid/cnnl_kernel.hh" +#include "../../../src/kernels/hard_sigmoid/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, HardSigmoidCnnl) { + // build routine + auto dataTensor = Tensor::share(DataType::F32, Shape{2, 3, 5}); + float alpha = 0.2f, beta = 0.5f; + auto kernel = HardSigmoidCnnl::build(alpha, beta, *dataTensor); + auto kCpu = HardSigmoidCpu::build(alpha, beta, *dataTensor); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // 
malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluMem = dev.malloc(dataTensor->bytesSize()); + // put input data + std::vector data(dataTensor->elementsSize()); + for (auto i : range0_(data.size())) { data[i] = i; } + mluMem->copyFromHost(data.data(), dataTensor->bytesSize()); + // inference + { + void const *inputs[]{*mluMem}; + void *outputs[]{*mluMem}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{data.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(dataTensor->elementsSize()); + mluMem->copyToHost(result.data(), dataTensor->bytesSize()); + // check + for (auto i : range0_(data.size())) { + EXPECT_FLOAT_EQ(data[i], result[i]); + } +} + +#endif diff --git a/src/04kernel/test/kernels/pad/test_cnnl.cpp b/src/04kernel/test/kernels/pad/test_cnnl.cpp new file mode 100644 index 000000000..9243be4c7 --- /dev/null +++ b/src/04kernel/test/kernels/pad/test_cnnl.cpp @@ -0,0 +1,131 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/pad/cpu_kernel.hh" +#include "../../../src/kernels/pad/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, PadCnnl) { + { + PadDimension dims{ + {2, 4, 1}, + {3, 5, 1}, + {1, 1, 0}, + {4, 8, 2}, + }; + // build routine + auto t1Tensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + auto t2Tensor = Tensor::share(DataType::I64, Shape{8}); + auto t3Tensor = Tensor::share(DataType::F32, Shape{}); + auto yTensor = Tensor::share(DataType::F32, Shape{4, 5, 1, 8}); + PadType type = PadType::Constant; + auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); + auto kernel = PadCnnl::build(dims, DataType::F32, type, std::make_optional(std::reference_wrapper(*t3Tensor))); + 
ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(t1Tensor->bytesSize()), + mluIn2 = dev.malloc(t2Tensor->bytesSize()), + mluIn3 = dev.malloc(t3Tensor->bytesSize()), + mluOut = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector data(t1Tensor->elementsSize()), + constvalue(1, 1.2f), + cpuOut(yTensor->elementsSize()); + std::vector pads{1, 1, 0, 2, 1, 1, 0, 2}; + + + for (auto i : range0_(data.size())) { data[i] = i; } + mluIn->copyFromHost(data.data(), t1Tensor->bytesSize()); + mluIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); + mluIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); + + // inference + { + void const *inputs[]{*mluIn, *mluIn2, *mluIn3}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data(), pads.data(), constvalue.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(yTensor->elementsSize()); + mluOut->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(cpuOut.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } + } + + { + PadDimension dims{ + {2, 2, 0}, + {3, 3, 0}, + {1, 1, 0}, + {4, 4, 0}, + }; + // build routine + auto t1Tensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + auto t2Tensor = Tensor::share(DataType::I64, Shape{8}); + auto t3Tensor = Tensor::share(DataType::F32, Shape{}); + auto yTensor = Tensor::share(DataType::F32, Shape{2, 3, 1, 4}); + PadType type = PadType::Constant; + auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); + auto kernel = PadCnnl::build(dims, DataType::F32, type, std::make_optional(std::reference_wrapper(*t3Tensor))); + ASSERT_TRUE(kernel && 
kCpu); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine, + rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(t1Tensor->bytesSize()), + mluIn2 = dev.malloc(t2Tensor->bytesSize()), + mluIn3 = dev.malloc(t3Tensor->bytesSize()), + mluOut = dev.malloc(yTensor->bytesSize()); + // put input data + std::vector data(t1Tensor->elementsSize()), + constvalue(1, 1.2f), + cpuOut(yTensor->elementsSize()); + std::vector pads{0, 0, 0, 0, 0, 0, 0, 0}; + + + for (auto i : range0_(data.size())) { data[i] = i; } + mluIn->copyFromHost(data.data(), t1Tensor->bytesSize()); + mluIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); + mluIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); + + // inference + { + void const *inputs[]{*mluIn, *mluIn2, *mluIn3}; + void *outputs[]{*mluOut}; + routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + { + void const *inputs[]{data.data(), pads.data(), constvalue.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(yTensor->elementsSize()); + mluOut->copyToHost(result.data(), yTensor->bytesSize()); + // check + for (auto i : range0_(cpuOut.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } + } +} + +#endif diff --git a/src/04kernel/test/kernels/select/test_cnnl.cpp b/src/04kernel/test/kernels/select/test_cnnl.cpp new file mode 100644 index 000000000..b691fc3d1 --- /dev/null +++ b/src/04kernel/test/kernels/select/test_cnnl.cpp @@ -0,0 +1,99 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/select/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +static void testSelect(const SelectType selectType, const std::vector &shapes, const Shape &outShape, const std::vector> &data, + 
const std::vector expectData) { + // build routine + TensorRefs dataTensors; + std::vector tensorsVec; + for (size_t i = 0; i < shapes.size(); ++i) { + tensorsVec.push_back(Tensor(DataType::F32, shapes[i], LayoutType::Others, nullptr)); + } + for (size_t i = 0; i < shapes.size(); ++i) { + dataTensors.push_back(std::cref(tensorsVec[i])); + } + auto result = Tensor::share(DataType::F32, outShape); + auto kernel = SelectCnnl::build(selectType, dataTensors); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // cnnl malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + Arc + workspace = dev.malloc(workspaceSize), + mluIns[]{ + dev.malloc(dataTensors[0].get().bytesSize()), + dev.malloc(dataTensors[1].get().bytesSize()), + dev.malloc(dataTensors[2].get().bytesSize()), + }, + mluOut = dev.malloc(result->bytesSize()); + // put input data + mluIns[0]->copyFromHost(data[0].data(), dataTensors[0].get().bytesSize()); + mluIns[1]->copyFromHost(data[1].data(), dataTensors[1].get().bytesSize()); + mluIns[2]->copyFromHost(data[2].data(), dataTensors[2].get().bytesSize()); + // inference + { + void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2]}; + void *outputs[]{*mluOut}; + routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); + } + // check + std::vector out(result->elementsSize()); + mluOut->copyToHost(out.data(), result->bytesSize()); + for (auto i : range0_(expectData.size())) { + EXPECT_FLOAT_EQ(expectData[i], out[i]); + } +} + +TEST(kernel, SelectCnnl) { + // no need broadcast + testSelect(SelectType::Max, + {{1, 3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + {{1, 3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, + {1, 2, 1}); + + // need broadcast + testSelect(SelectType::Max, + {{3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + 
{{3}, {1, 3}, {1, 3}}, + {1, 3}, + {{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, + {1, 3, 3}); + + testSelect(SelectType::Max, + {{1}, {1, 3}, {1, 3}}, + {1, 3}, + {{3}, {1, 4, 4}, {2, 5, 3}}, + {3, 5, 4}); + + testSelect(SelectType::Min, + {{1}, {1, 3}, {1, 3}}, + {1, 3}, + {{3}, {1, 4, 4}, {2, 5, 3}}, + {1, 3, 3}); +} + +#endif