From e838327ae7450b7b6dc49c0b1f43bf57288d53db Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 1 Mar 2024 07:09:31 +0000 Subject: [PATCH 1/2] copy cnnlSoftmax --- src/04kernel/src/collectors/softmax.cc | 7 ++ .../src/kernels/softmax/cnnl_kernel.cc | 88 +++++++++++++++++++ .../src/kernels/softmax/cnnl_kernel.hh | 36 ++++++++ .../src/utilities/bang/cnnl_context.cc | 45 ++++++++++ .../src/utilities/bang/cnnl_context.hh | 32 +++++++ .../src/utilities/bang/cnnl_functions.cpp | 39 ++++++++ .../src/utilities/bang/cnnl_functions.h | 41 +++++++++ .../src/utilities/bang/cnrt_functions.cc | 28 ++++++ .../src/utilities/bang/cnrt_functions.h | 17 ++++ .../test/kernels/softmax/test_cnnl.cpp | 55 ++++++++++++ src/09python_ffi/CMakeLists.txt | 3 + src/09python_ffi/src/compiler.cc | 1 + 12 files changed, 392 insertions(+) create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.cc create mode 100644 src/04kernel/src/kernels/softmax/cnnl_kernel.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.cc create mode 100644 src/04kernel/src/utilities/bang/cnnl_context.hh create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.cpp create mode 100644 src/04kernel/src/utilities/bang/cnnl_functions.h create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.cc create mode 100644 src/04kernel/src/utilities/bang/cnrt_functions.h create mode 100644 src/04kernel/test/kernels/softmax/test_cnnl.cpp diff --git a/src/04kernel/src/collectors/softmax.cc b/src/04kernel/src/collectors/softmax.cc index 2ce442696..020bc6ded 100644 --- a/src/04kernel/src/collectors/softmax.cc +++ b/src/04kernel/src/collectors/softmax.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/softmax.h" +#include "../kernels/softmax/cnnl_kernel.hh" #include "../kernels/softmax/cpu_kernel.hh" #include "../kernels/softmax/cuda_kernel.hh" #include "../kernels/softmax/cudnn_kernel.hh" @@ -28,6 +29,12 @@ namespace refactor::kernel { } break; } + case decltype(_target)::Mlu: { + if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + } default: UNREACHABLEX(void, "Unknown target"); } diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc new file mode 100644 index 000000000..babaf33cc --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -0,0 +1,88 @@ +#include "cnnl_kernel.hh" + +#ifdef USE_BANG +#include "../../utilities/bang/cnnl_context.hh" +#include "../../utilities/bang/cnnl_functions.h" +#endif + +namespace refactor::kernel { + using K = SoftmaxCnnl; + + K::SoftmaxCnnl(cnnl::SoftmaxAlgo algo_, DataType type_, + int pre_, int mid_, int post_) noexcept + : Kernel(), algo(algo_), dataType(type_), + pre(pre_), mid(mid_), post(post_) {} + + auto K::build(cnnl::SoftmaxAlgo algo, SoftmaxInfo info) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return std::make_unique(algo, info.type, info.pre, info.mid, info.post); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing softmax forward with CNNL"; + } + +#ifdef USE_BANG + + auto SoftmaxCnnl::lower(Resources &res) const -> RoutineWorkspace { + using namespace cnnl; + using namespace runtime; + + // RAII for closure + struct Descriptors { + cnnlTensorDescriptor_t t; + cnnlSoftmaxAlgorithm_t algo; + bool f32; + + Descriptors(decltype(algo) algo_, decltype(f32) f32_) + : algo(algo_), f32(f32_) { + CNNL_ASSERT(cnnlCreateTensorDescriptor(&t)); + } + ~Descriptors() noexcept(false) { + CNNL_ASSERT(cnnlDestroyTensorDescriptor(t)); + } + Descriptors(const Descriptors &) = delete; + Descriptors(Descriptors &&) = delete; + }; + + auto d = std::make_shared( + static_cast(algo), + dataType != DataType::F64); + int dims[]{pre, mid, post}; + // cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION + // : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION + // : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + // FIXME(bolun): CNNL Softmax mode + cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + + // cnnlSoftmaxForward_v2 is applied to a 3D input tensor only + CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims)); + + res.fetchOrStore(); + return [d = std::move(d), mode](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + // build alpha/beta for double + auto a = d->f32 ? factor(1) : factor(1), + b = d->f32 ? factor(0) : factor(0); + CNNL_ASSERT(cnnlSoftmaxForward_v2( + res.fetchOrStore()->handle, + d->algo, + mode, + CNNL_COMPUTATION_ULTRAHIGH_PRECISION, + &a, d->t, inputs[0], + &b, d->t, outputs[0])); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.hh b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh new file mode 100644 index 000000000..b9bedb5a4 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.hh @@ -0,0 +1,36 @@ +#ifndef KERNEL_SOFTMAX_CNNL_HH +#define KERNEL_SOFTMAX_CNNL_HH + +#include "kernel/attributes/softmax_info.h" +#include "kernel/collectors/softmax.h" + +namespace refactor::kernel { + + namespace cnnl { + enum class SoftmaxAlgo { + FAST = 0, + ACCURATE = 1, + LOG = 2, + }; + }// namespace cnnl + + struct SoftmaxCnnl final : public Kernel { + cnnl::SoftmaxAlgo algo; + DataType dataType; + int pre, mid, post; + + SoftmaxCnnl(cnnl::SoftmaxAlgo, DataType, int, int, int) noexcept; + + static KernelBox build(cnnl::SoftmaxAlgo, SoftmaxInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SOFTMAX_CNNL_HH diff --git a/src/04kernel/src/utilities/bang/cnnl_context.cc b/src/04kernel/src/utilities/bang/cnnl_context.cc new file mode 100644 index 000000000..54bad2d93 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.cc @@ -0,0 +1,45 @@ +#ifdef USE_BANG + +#include "cnnl_context.hh" +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + CnnlContext::CnnlContext() : runtime::Resource() { + BANG_ASSERT(cnrtQueueCreate(&queue)); + CNNL_ASSERT(cnnlCreate(&handle)); + CNNL_ASSERT(cnnlSetQueue(handle, queue)); + } + CnnlContext::~CnnlContext() { + BANG_ASSERT(cnrtQueueDestroy(queue)); + CNNL_ASSERT(cnnlDestroy(handle)); + } + + auto CnnlContext::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto CnnlContext::build() -> runtime::ResourceBox { + return std::make_unique(); + } + + auto CnnlContext::resourceTypeId() const noexcept -> size_t { + return typeId(); + } + auto CnnlContext::description() const noexcept -> std::string_view { + return "CnnlContext"; + } + + void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) { + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), size, + CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + + void CnnlContext::queueSync() { + BANG_ASSERT(cnrtQueueSync(queue)); + } + +}// namespace refactor::kernel::cnnl + +#endif + diff --git a/src/04kernel/src/utilities/bang/cnnl_context.hh b/src/04kernel/src/utilities/bang/cnnl_context.hh new file mode 100644 index 000000000..d7a66d0be --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_context.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_CNNL_CONTEXT_HH +#define KERNEL_CNNL_CONTEXT_HH + +#include "runtime/resource.h" +#include +#include + +namespace refactor::kernel::cnnl { + + struct CnnlContext final : public runtime::Resource { + cnnlHandle_t handle; + cnrtQueue_t queue; + + CnnlContext(); + ~CnnlContext(); + CnnlContext(CnnlContext const &) noexcept = delete; + CnnlContext(CnnlContext &&) noexcept = delete; + + static size_t typeId() noexcept; + static runtime::ResourceBox build(); + + size_t resourceTypeId() const noexcept final; + std::string_view description() const noexcept final; + + void copyFromCPU(void *dst, const void *src, size_t size); + void queueSync(); + }; + +}// namespace refactor::kernel::cnnl + +#endif// KERNEL_CNNL_CONTEXT_HH + diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.cpp b/src/04kernel/src/utilities/bang/cnnl_functions.cpp new file mode 100644 index 000000000..12bdbfaee --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.cpp @@ -0,0 +1,39 @@ +#ifdef USE_BANG + +#include "cnnl_functions.h" + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType dataType) { + // clang-format off + switch (dataType) { + case DataType::F32 : return CNNL_DTYPE_FLOAT; break; + case DataType::F64 : return CNNL_DTYPE_DOUBLE; break; + case DataType::FP16: return CNNL_DTYPE_HALF; break; + case DataType::I8 : return CNNL_DTYPE_INT8; break; + case DataType::I32 : return CNNL_DTYPE_INT32; break; + case DataType::U8 : return CNNL_DTYPE_UINT8; break; + case DataType::BF16: return CNNL_DTYPE_BFLOAT16; break; + case DataType::I64 : return CNNL_DTYPE_INT64; break; + case DataType::Bool: return CNNL_DTYPE_BOOL; break; + default: UNREACHABLE(); + } + // clang-format on + } + + void setCnnlTensor(cnnlTensorDescriptor_t t, DataType dt, slice_t d) { + auto dt_ = cnnlDataTypeConvert(dt); + if (auto n = d.size(); n == 4) { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } else if (n < 4) { + int d_[]{1, 1, 1, 1}; + std::copy_n(d.begin(), n, d_ + 4 - n); + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, 4, std::move(d_))); + } else { + CNNL_ASSERT(cnnlSetTensorDescriptor(t, CNNL_LAYOUT_NCHW, dt_, d.size(), d.begin())); + } + } +}// namespace refactor::kernel::cnnl + +#endif + diff --git a/src/04kernel/src/utilities/bang/cnnl_functions.h b/src/04kernel/src/utilities/bang/cnnl_functions.h new file mode 100644 index 000000000..60b06ac5c --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnnl_functions.h @@ -0,0 +1,41 @@ +#ifndef KERNEL_CNNL_FUNCTIONS_H +#define KERNEL_CNNL_FUNCTIONS_H + +#include "common.h" +#include + +#define BANG_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \ + RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \ + cnrtGetErrorStr(status), (int) status)); \ + } + +#define CNNL_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNNL_STATUS_SUCCESS) { \ + fmt::println("cnnl failed on \"" #STATUS "\" with {}", \ + cnnlGetErrorString(status)); \ + abort(); \ + } + +namespace refactor::kernel::cnnl { + + cnnlDataType_t cnnlDataTypeConvert(DataType); + + // A helper function that set Cnnl tensor descriptor given tensor shape and type + void setCnnlTensor(cnnlTensorDescriptor_t, DataType, slice_t); + + template + constexpr uint64_t factor(T x) noexcept { + static_assert(std::is_floating_point_v); + static_assert(sizeof(T) <= sizeof(uint64_t)); + union { + T f; + uint64_t i; + } u{x}; + return u.i; + } + +}// namespace refactor::kernel::cnnl + +#endif// KERNEL_CNNL_FUNCTIONS_H + diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc new file mode 100644 index 000000000..f2468d971 --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -0,0 +1,28 @@ +#ifdef USE_BANG +#include "cnrt_functions.h" +#include "cnnl_functions.h" +#include +#include + +namespace refactor::kernel::bang { + + int currentDevice() { + int device; + BANG_ASSERT(cnrtGetDevice(&device)); + return device; + } + + void sync() { + BANG_ASSERT(cnrtSyncDevice()); + } + + void copyOut(void *dst, const void *src, size_t size) { + sync(); + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), size, + CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + +}// namespace refactor::kernel::bang + +#endif + diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h new file mode 100644 index 000000000..504addd6f --- /dev/null +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_CNRT_FUNCTIONS_H +#define KERNEL_CNRT_FUNCTIONS_H + +#include "common.h" + +namespace refactor::kernel::bang { + + int currentDevice(); + + void sync(); + + void copyOut(void *dst, const void *src, size_t size); + +}// namespace refactor::kernel::bang + +#endif// KERNEL_CNRT_FUNCTIONS_H + diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp new file mode 100644 index 000000000..e97701ca8 --- /dev/null +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -0,0 +1,55 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "../../../src/kernels/softmax/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SoftmaxCnnl) { + // build routine + auto xTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + auto outTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + dim_t axis = 2; + auto kCpu = SoftmaxCpu::build(SoftmaxInfo(*xTensor, axis)); + auto kCnnl = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::FAST, SoftmaxInfo(*xTensor, axis)); + ASSERT_TRUE(kCpu && kCnnl); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto rCnnl = kCnnl->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto mluIn = dev.malloc(xTensor->bytesSize()), + mluOut = dev.malloc(outTensor->bytesSize()); + // put input data + std::vector + data(xTensor->elementsSize(), 0), + cpuOut(outTensor->elementsSize()); + mluIn->copyFromHost(data.data(), xTensor->bytesSize()); + // inference + { + void const *inputs[]{data.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*mluIn}; + void *outputs[]{*mluOut}; + rCnnl(res, nullptr, inputs, outputs); + kernel::bang::sync(); + } + // take output data + std::vector result(outTensor->elementsSize()); + mluOut->copyToHost(result.data(), outTensor->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif + diff --git a/src/09python_ffi/CMakeLists.txt b/src/09python_ffi/CMakeLists.txt index ccce34d37..c4d28a9f5 100644 --- a/src/09python_ffi/CMakeLists.txt +++ b/src/09python_ffi/CMakeLists.txt @@ -10,6 +10,9 @@ pybind11_add_module(python_ffi SHARED ${PYFFI_SRC}) target_link_libraries(python_ffi PRIVATE onnx llm communication) target_include_directories(python_ffi PRIVATE include) +if(USE_BANG) + target_include_directories(python_ffi PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../04kernel/src/utilities/bang) +endif() # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a # define (VERSION_INFO) here. # target_compile_definitions(python_ffi diff --git a/src/09python_ffi/src/compiler.cc b/src/09python_ffi/src/compiler.cc index bf04053e9..45450582e 100644 --- a/src/09python_ffi/src/compiler.cc +++ b/src/09python_ffi/src/compiler.cc @@ -95,6 +95,7 @@ namespace refactor::python_ffi { // clang-format off auto target_ = target == "cpu" ? Target::Cpu : target == "cuda" ? Target::Nvidia + : target == "mlu" ? Target::Mlu : UNREACHABLEX(Target, "Unknown target: {}", target); // clang-format on return compileOn(hardware::device::fetch(target_), From 6981bafb8e316aeced7d9a7ecbecf8d24168ad33 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 5 Mar 2024 05:45:28 +0000 Subject: [PATCH 2/2] compile error --- src/04kernel/CMakeLists.txt | 3 + src/04kernel/src/collectors/softmax.cc | 4 + .../src/kernels/softmax/bang_kernel.cc | 29 + .../src/kernels/softmax/bang_kernel.hh | 26 + .../src/kernels/softmax/bang_kernel.mlu | 599 ++++++++++++++++++ .../test/kernels/softmax/test_bang.cpp | 54 ++ 6 files changed, 715 insertions(+) create mode 100644 src/04kernel/src/kernels/softmax/bang_kernel.cc create mode 100644 src/04kernel/src/kernels/softmax/bang_kernel.hh create mode 100644 src/04kernel/src/kernels/softmax/bang_kernel.mlu create mode 100644 src/04kernel/test/kernels/softmax/test_bang.cpp diff --git a/src/04kernel/CMakeLists.txt b/src/04kernel/CMakeLists.txt index 77b655c0e..e63009a08 100644 --- a/src/04kernel/CMakeLists.txt +++ b/src/04kernel/CMakeLists.txt @@ -7,6 +7,9 @@ if(USE_CUDA) file(GLOB_RECURSE KERNEL_CUDA_SRC src/*.cu) add_subdirectory(cuda) endif() +if(USE_BANG) + file(GLOB_RECURSE KERNEL_BANG_SRC src/*.mlu) +endif() add_library(kernel STATIC ${KERNEL_SRC} ${KERNEL_CUDA_SRC}) target_link_libraries(kernel PUBLIC runtime) diff --git a/src/04kernel/src/collectors/softmax.cc b/src/04kernel/src/collectors/softmax.cc index 020bc6ded..6f7b0a0c6 100644 --- a/src/04kernel/src/collectors/softmax.cc +++ b/src/04kernel/src/collectors/softmax.cc @@ -1,4 +1,5 @@ #include "kernel/collectors/softmax.h" +#include "../kernels/softmax/bang_kernel.hh" #include "../kernels/softmax/cnnl_kernel.hh" #include "../kernels/softmax/cpu_kernel.hh" #include "../kernels/softmax/cuda_kernel.hh" @@ -33,6 +34,9 @@ namespace refactor::kernel { if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) { ans.emplace_back(std::move(ptr)); } + if (auto ptr = SoftmaxBang::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } break; } default: diff --git a/src/04kernel/src/kernels/softmax/bang_kernel.cc b/src/04kernel/src/kernels/softmax/bang_kernel.cc new file mode 100644 index 000000000..e39e25a20 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/bang_kernel.cc @@ -0,0 +1,29 @@ +#include "bang_kernel.hh" + +namespace refactor::kernel { + using K = SoftmaxBang; + + K::SoftmaxBang(SoftmaxInfo info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(SoftmaxInfo info) noexcept -> KernelBox { +#ifndef USE_BANG + return nullptr; +#endif + + return info.type.isFloat() + ? std::make_unique(std::move(info)) + : nullptr; + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing Softmax using BANG"; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/softmax/bang_kernel.hh b/src/04kernel/src/kernels/softmax/bang_kernel.hh new file mode 100644 index 000000000..b2b5fc038 --- /dev/null +++ b/src/04kernel/src/kernels/softmax/bang_kernel.hh @@ -0,0 +1,26 @@ +#ifndef KERNEL_SOFTMAX_BANG_HH +#define KERNEL_SOFTMAX_BANG_HH + +#include "cnnl.h" +#include "cnrt.h" +#include "kernel/attributes/softmax_info.h" +#include "kernel/collectors/softmax.h" +namespace refactor::kernel { + + struct SoftmaxBang final : public Kernel { + SoftmaxInfo info; + + SoftmaxBang(SoftmaxInfo) noexcept; + static KernelBox build(SoftmaxInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_BANG + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif//KERNEL_SOFTMAX_BANG_HH diff --git a/src/04kernel/src/kernels/softmax/bang_kernel.mlu b/src/04kernel/src/kernels/softmax/bang_kernel.mlu new file mode 100644 index 000000000..f19c898cf --- /dev/null +++ b/src/04kernel/src/kernels/softmax/bang_kernel.mlu @@ -0,0 +1,599 @@ +#include "bang_kernel.hh" +#include +#include +#define EPS 1e-7 +const int NRAM_MAX_SIZE = 1024 * 256;//Apply for maximum memory in advance from NRAM +const int nramNum = NRAM_MAX_SIZE / sizeof(float); +const int SRC_MAX_SIZE = 1024 * 32;//The subsequent tree summation must ensure that SRC-MAX-SIZE is a power of 2 +const int maxNum = SRC_MAX_SIZE / sizeof(float); +const int warpSize = 32; + +namespace refactor::kernel { + using namespace runtime; + + __mlu_device__ void softmaxKernelAxis_m(float *destination, float *source, int frontsize, int dimsize, int stride, int strideS) { + 0= maxNum) { + float *src = nram_buffer; + float *tmpSum = src + maxNum; + float *tmpNewMax = tmpSum + maxNum; + float *tmpOldMax = tmpNewMax + maxNum; + + int remain = stride % maxNum; + int repeat = (stride - remain) / maxNum; + + int taskRemain = frontsize % taskDim; + int stepEasy = (frontsize - taskRemain) / taskDim; + int stepHard = stepEasy + 1; + + int indStart = (taskId < taskRemain ? taskId * stepHard : taskRemain * stepHard + (taskId - taskRemain) * stepEasy); + source = source + indStart * dimsize * stride; + destination = destination + indStart * dimsize * stride; + + for (int ind = taskId; ind < frontsize; ind += taskDim) { + int frontIdx = ind * dimsize * stride; + for (int j = 0; j < repeat; j++) { + __bang_write_value(tmpNewMax, maxNum, -INFINITY); + __bang_write_zero(tmpSum, maxNum); + __bang_write_zero(src, maxNum); + for (int i = 0; i < dimsize; i++) { + __memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum); //exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, maxNum); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, maxNum);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, maxNum); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, src, maxNum); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM + } + __bang_active_recip_greater_1(tmpSum, tmpSum, maxNum);//compute 1/sum + //Start exponential transformation and write back to GDRAM + __bang_mul(src, src, tmpSum, maxNum);//The data stored in the src at the end of the loop above can be utilized + __memcpy(destination + (dimsize - 1) * stride + frontIdx + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + __bang_mul(src, src, tmpSum, maxNum); + __memcpy(destination + frontIdx + i * stride + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM); + } + } + if (remain) { + __bang_write_value(tmpNewMax, maxNum, -INFINITY); + __bang_write_zero(tmpSum, maxNum); + __bang_write_value(src, maxNum, -INFINITY); + for (int i = 0; i < dimsize; i++) { + __memcpy(src, source + frontIdx + i * stride + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, maxNum); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, maxNum);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, maxNum); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, src, maxNum); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM + } + //------------------- + __bang_active_recip_greater_1(tmpSum, tmpSum, maxNum);//compute 1/sum + //Start exponential transformation and write back to GDRAM + __bang_mul(src, src, tmpSum, maxNum);//The data stored in the src at the end of the loop above can be utilized + __memcpy(destination + (dimsize - 1) * stride + frontIdx + repeat * maxNum, src, remain * sizeof(float), NRAM2GDRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(src, source + i * stride + frontIdx + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + __bang_mul(src, src, tmpSum, maxNum); + __memcpy(destination + i * stride + frontIdx + repeat * maxNum, src, remain * sizeof(float), NRAM2GDRAM); + } + //--------------------- + } + } + } else if (stride < maxNum && dimsize * stride >= maxNum) { + + + float *src = nram_buffer; + float *tmp = src + maxNum; + float *tmpOldMax = tmp + strideS; + float *tmpNewMax = tmpOldMax + strideS; + float *tmpSum = tmpNewMax + strideS; + + int multiple = maxNum / stride; + int size = multiple * stride; //The maximum amount of data that can be stored in an SRC + int remain = dimsize % multiple; //If it cannot be divisible, this part of the data needs special processing + int repeat = (dimsize - remain) / multiple;//The total number of loops required to load the entire dimsize + + int taskRemain = frontsize % taskDim; + int stepEasy = (frontsize - taskRemain) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < taskRemain ? stepHard : stepEasy);//The number of frontsize processed per taskId + int indStart = (taskId < taskRemain ? taskId * stepHard : taskRemain * stepHard + (taskId - taskRemain) * stepEasy); + source = source + indStart * dimsize * stride; + destination = destination + indStart * dimsize * stride; + //printf("maxNum:%d, dimsize * stride:%d, multiple:%d, size:%d, repeat:%d,remain:%d\n",maxNum, dimsize * stride, multiple, size, repeat,remain); + for (int ind = 0; ind < step; ind++) { + int frontIdx = ind * dimsize * stride; + + __bang_write_value(tmpNewMax, strideS, -INFINITY);//Must be initialized to negative infinity + __bang_write_value(tmp, strideS, -INFINITY); //Must be initialized to negative infinity + __bang_write_zero(tmpSum, strideS); //Must be initialized to zero + + for (int j = 0; j < repeat; j++) { + __memcpy(src, source + frontIdx + j * multiple * stride, size * sizeof(float), GDRAM2NRAM); + for (int m = 0; m < multiple; m++) { + __memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM); + + __bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);//Although the stream S stream section after tmpNewMax is 0, there is no need to write back to GDRAM, which does not affect the result + + __bang_sub(tmp, tmp, tmpNewMax, strideS);//The stripe S stripe section after tmp is 0 + __bang_active_exp_less_0(tmp, tmp, strideS); + if (j != 0 || m != 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, strideS);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, strideS); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, tmp, strideS);//sum += exp(x - M) + //if(m == 0) __bang_printf("tmp:%.2f, tmpMax[0]:%.2f,tmpSum[0]:%.2f\n", tmp[1], tmpNewMax[1],tmpSum[0]); + __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM + } + } + //__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[0],tmpSum[0]); + if (remain) { + __memcpy(src, source + frontIdx + repeat * multiple * stride, remain * stride * sizeof(float), GDRAM2NRAM); + for (int m = 0; m < remain; m++) { + __memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS); + __bang_sub(tmp, tmp, tmpNewMax, strideS);//The stripe S stripe section after tmp is 0 + __bang_active_exp_less_0(tmp, tmp, strideS); + if (repeat != 0 || m != 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, strideS);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, strideS); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, tmp, strideS); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM + } + } + + //At this point, tmpNewMax stores the maximum value of the data corresponding to a fixed frontIdx and bedsize, while tmpSum stores the corresponding value sum + //__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]); + __bang_active_recip_greater_1(tmpSum, tmpSum, strideS); + //__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]); + if (remain) { + for (int m = 0; m < remain; m++) { + __memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM); + __bang_sub(tmp, tmp, tmpNewMax, strideS); + __bang_active_exp_less_0(tmp, tmp, strideS); + __bang_mul(tmp, tmp, tmpSum, strideS); + __memcpy(destination + frontIdx + repeat * multiple * stride + m * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + } + } + for (int j = 0; j < repeat; j++) { + __memcpy(src, source + frontIdx + j * multiple * stride, size * sizeof(float), GDRAM2NRAM); + for (int m = 0; m < multiple; m++) { + __memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM); + + __bang_sub(tmp, tmp, tmpNewMax, strideS); + __bang_active_exp_less_0(tmp, tmp, strideS); + __bang_mul(tmp, tmp, tmpSum, strideS); + __memcpy(destination + frontIdx + j * multiple * stride + m * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + } + } + } + } else if (dimsize * stride < maxNum) { + + float *src = nram_buffer; + float *tmp = src + maxNum; + float *tmpOldMax = tmp + strideS; + float *tmpNewMax = tmpOldMax + strideS; + float *tmpSum = tmpNewMax + strideS; + int behindsize = dimsize * stride; + int multiple = maxNum / behindsize;//Represents the amount that a maxNum can share in frontsize + + int remainF = frontsize % (taskDim * multiple); + int remainT = remainF % taskDim; + int stepEasy = (remainF - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int taskRepeat = (frontsize - remainF) / (taskDim * multiple); + //At this point, corresponding to frontsize, the amount of data processed by each taskId is taskRepeat * multiple+step + int startHard = taskId * (taskRepeat * multiple + stepHard); + int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy); + int indStart = (taskId < remainT ? startHard : startEasy); + source = source + indStart * behindsize;//indStart * behindsize Indicates the offset corresponding to different taskIds + destination = destination + indStart * behindsize; + int tid; + for (int s = 0; s < taskRepeat; s++) { + tid = s * multiple * behindsize; + __memcpy(src, source + tid, multiple * behindsize * sizeof(float), GDRAM2NRAM); + for (int m = 0; m < multiple; m++) { + __bang_write_zero(tmpSum, strideS); + __bang_write_value(tmp, strideS, -INFINITY); + __bang_write_value(tmpNewMax, strideS, -INFINITY); + for (int i = 0; i < dimsize; i++) { + __memcpy(tmp, src + m * behindsize + i * stride, stride * sizeof(float), NRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS); + __bang_sub(tmp, tmp, tmpNewMax, strideS); //x - M + __bang_active_exp_less_0(tmp, tmp, strideS);//exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, strideS);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, strideS); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, tmp, strideS); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM + } + __bang_active_recip_greater_1(tmpSum, tmpSum, strideS); + __bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized + //__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + __memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(tmp, src + m * behindsize + i * stride, stride * sizeof(float), NRAM2NRAM); + __bang_sub(tmp, tmp, tmpNewMax, strideS); //x - M + __bang_active_exp_less_0(tmp, tmp, strideS);//exp(x - M) + __bang_mul(tmp, tmp, tmpSum, strideS); + //__memcpy(destination + tid + m * behindsize + i * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + __memcpy(src + m * behindsize + i * stride, tmp, stride * sizeof(float), NRAM2NRAM); + } + } + __memcpy(destination + tid, src, multiple * behindsize * sizeof(float), NRAM2GDRAM); + } + //__bang_printf("taskId:%d, multiple:%d, taskRepeat:%d, step:%d, indStart:%d\n",taskId, multiple, taskRepeat, step, indStart * behindsize); + if (step) { + tid = taskRepeat * multiple * behindsize; + __memcpy(src, source + tid, step * behindsize * sizeof(float), GDRAM2NRAM); + for (int m = 0; m < step; m++) { + __bang_write_zero(tmpSum, strideS); + __bang_write_value(tmp, strideS, -INFINITY); + __bang_write_value(tmpNewMax, strideS, -INFINITY); + for (int i = 0; i < dimsize; i++) { + __memcpy(tmp, src + m * behindsize + i * stride, stride * sizeof(float), NRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS); + __bang_sub(tmp, tmp, tmpNewMax, strideS); //x - M + __bang_active_exp_less_0(tmp, tmp, strideS);//exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, strideS);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, strideS); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, tmp, strideS); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM + } + //__bang_printf("max:%.2f,%.2f, sum:%.2f,sum:%.2f\n", tmpNewMax[0], tmpNewMax[1], tmpSum[0], tmpSum[0]); + __bang_active_recip_greater_1(tmpSum, tmpSum, strideS); + __bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized + //__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + __memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(tmp, src + m * behindsize + i * stride, stride * sizeof(float), NRAM2NRAM); + __bang_sub(tmp, tmp, tmpNewMax, strideS); //x - M + __bang_active_exp_less_0(tmp, tmp, strideS);//exp(x - M) + __bang_mul(tmp, tmp, tmpSum, strideS); + //__memcpy(destination + tid + m * behindsize + i * stride, tmp, stride * sizeof(float), NRAM2GDRAM); + __memcpy(src + m * behindsize + i * stride, tmp, stride * sizeof(float), NRAM2NRAM); + } + } + __memcpy(destination + tid, src, step * behindsize * sizeof(float), NRAM2GDRAM); + } + } + } + + __mlu_device__ void softmaxKernelAxis_e(float *destination, float *source, int othersize, int dimsize, int dimS) {axis = -1 + int multiple = maxNum / dimsize; + int size = taskDim * multiple; + int remainS = othersize % size; + int taskRepeat = (othersize - remainS) / size; + int remainT = remainS % taskDim; + int stepEasy = (remainS - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + //The amount allocated for processing othersize for each taskId is taskRepeat * multiple+step + //Overall, the amount of data processed by each taskId is (taskRepeat * multiple+step) * dimsize + int startHard = taskId * (taskRepeat * multiple + stepHard); + int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy); + int indStart = (taskId < remainT ? startHard : startEasy); + source = source + indStart * dimsize; + destination = destination + indStart * dimsize; + + __nram__ float nram_buffer[nramNum]; + + float *src = nram_buffer; + float *tmp = src + maxNum; + float *destSum = tmp + dimS; + int remainDim = dimsize % dimS;//Dimsize may not be a power of 2 + int repeatDim = (dimsize - remainDim) / dimS; + __nram__ float destSumFinal[warpSize];//Reduce destSum to destFinal [0] + __nram__ float srcMax[2]; + __nram__ float destOldMax; + __nram__ float destNewMax; + //printf("taskId:%d, taskRepeat:%d, step:%d, repeatDim:%d, indstart:%d, %d\n", taskId, taskRepeat, step, repeatDim, indStart, indStart * dimsize); + int tid; + for (int s = 0; s < taskRepeat; s++) { + tid = s * multiple * dimsize; + __memcpy(src, source + tid, multiple * dimsize * sizeof(float), GDRAM2NRAM); + for (int j = 0; j < multiple; j++) { + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, warpSize); + destNewMax = -INFINITY; + + for (int i = 0; i < repeatDim; i++) { + __memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM); + __bang_argmax(srcMax, tmp, dimS); + if (destNewMax < srcMax[0]) { + destNewMax = srcMax[0]; + } + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + if (i > 0) { + __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), dimS); + } + __bang_add(destSum, destSum, tmp, dimS); + destOldMax = destNewMax; + } + if (remainDim) { + __bang_write_value(tmp, dimS, -INFINITY); + __memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM); + __bang_argmax(srcMax, tmp, dimS); + if (destNewMax < srcMax[0]) { + destNewMax = srcMax[0]; + } + __bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax + __memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM); + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + if (repeatDim > 0) { + __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), dimS); + } + __bang_add(destSum, destSum, tmp, dimS); + destOldMax = destNewMax; + } + + int segNum = dimS / warpSize;//Starting numerical summation + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int i = 0; i < strip; i++) { + __bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize); + } + } + __bang_reduce_sum(destSumFinal, destSum, warpSize);//At this point, destSumFinal [0] saves the numerical value of the current dimsize length data sum + if (remainDim) { + destSumFinal[0] = destSumFinal[0] - (dimS - remainDim); + } + //Now let's start writing back the data + float globalSumInv = 1.0 / destSumFinal[0]; + if (remainDim) { + __bang_mul_scalar(tmp, tmp, globalSumInv, dimS); + __memcpy(destination + tid + j * dimsize + repeatDim * dimS, tmp, remainDim * sizeof(float), NRAM2GDRAM); + } + for (int i = 0; i < repeatDim; i++) { + __memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM); + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + __bang_mul_scalar(tmp, tmp, globalSumInv, dimS); + __memcpy(destination + tid + j * dimsize + i * dimS, tmp, dimS * sizeof(float), NRAM2GDRAM); + } + } + //it is necessary to write back to GDRAM immediately. If you first write back to src and then write back to GDRAM, + //there may be a situation where src writes back to GDRAM before modifying the src data + } + if (step) {//Step targets parts of othersize that cannot be divided by multiple * dimsize + tid = taskRepeat * multiple * dimsize; + __memcpy(src, source + tid, step * dimsize * sizeof(float), GDRAM2NRAM); + for (int j = 0; j < step; j++) { + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, warpSize); + destNewMax = -INFINITY; + for (int i = 0; i < repeatDim; i++) {//RepeatDim refers to the total number of cycles required to read the current dimsize data using dimS after fixing otherIdx + __memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM); + __bang_argmax(srcMax, tmp, dimS); + if (destNewMax < srcMax[0]) { + destNewMax = srcMax[0]; + } + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + if (i > 0) { + __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), dimS); + } + __bang_add(destSum, destSum, tmp, dimS); + destOldMax = destNewMax; + } + if (remainDim) {//RemainDim refers to the part of dimsize that cannot be divided by dimS after fixing otherIdx + __bang_write_value(tmp, dimS, -INFINITY); + __memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM); + __bang_argmax(srcMax, tmp, dimS); + if (destNewMax < srcMax[0]) { + destNewMax = srcMax[0]; + } + + __bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax + __memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM); + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + if (repeatDim > 0) { + __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), dimS); + } + __bang_add(destSum, destSum, tmp, dimS); + destOldMax = destNewMax; + } + int segNum = dimS / warpSize;//Starting numerical summation + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int i = 0; i < strip; i++) { + __bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize); + } + } + __bang_reduce_sum(destSumFinal, destSum, warpSize); + //At this point, destSumFinal [0] saves the numerical value of the current dimsize length data sum + if (remainDim) { + destSumFinal[0] = destSumFinal[0] - (dimS - remainDim); + } + //__bang_printf("taskId:%d, max:%.2f, sum:%.2f\n", taskId, destNewMax, destSumFinal[0]); + float globalSumInv = 1.0 / destSumFinal[0]; + if (remainDim) { + __bang_mul_scalar(tmp, tmp, globalSumInv, dimS); + __memcpy(destination + tid + j * dimsize + repeatDim * dimS, tmp, remainDim * sizeof(float), NRAM2GDRAM); + } + for (int i = 0; i < repeatDim; i++) { + __memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM); + __bang_sub_scalar(tmp, tmp, destNewMax, dimS); + __bang_active_exp_less_0(tmp, tmp, dimS); + __bang_mul_scalar(tmp, tmp, globalSumInv, dimS); + __memcpy(destination + tid + j * dimsize + i * dimS, tmp, dimS * sizeof(float), NRAM2GDRAM); + } + } + } + } + __mlu_device__ void softmaxKernelAxis_s(float *destination, float *source, int othersize, int dimsize, int stride) {axis = 0 + __nram__ float src[maxNum]; //Transfer maxNum data to NRAM every time + __nram__ float tmpSum[maxNum]; + __nram__ float tmpNewMax[maxNum]; + __nram__ float tmpOldMax[maxNum]; + + int remain = othersize % taskDim; + int stepEasy = (othersize - remain) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remain ? stepHard : stepEasy);//The first part of taskId handles an additional element + int indStart = (taskId < remain ? taskId * stepHard : remain * stepHard + (taskId - remain) * stepEasy); + int remainNram = step % maxNum; + int repeat = (step - remainNram) / maxNum; + + //__bang_printf("taskId:%d, repeat:%d, step:%d, indStart:%d, remainNram:%d\n", taskId, repeat, step, indStart, remainNram); + for (int j = 0; j < repeat; j++) { + __bang_write_value(tmpNewMax, maxNum, -INFINITY); + __bang_write_zero(tmpSum, maxNum); + for (int i = 0; i < dimsize; i++) { + __memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum); //exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, maxNum); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, maxNum);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, maxNum); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, src, maxNum); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM + } + __bang_active_recip_greater_1(tmpSum, tmpSum, maxNum);//compute 1/sum + //Start exponential transformation and write back to GDRAM + __bang_mul(src, src, tmpSum, maxNum);//The data stored in the src at the end of the loop above can be utilized + __memcpy(destination + (dimsize - 1) * stride + indStart + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + __bang_mul(src, src, tmpSum, maxNum); + __memcpy(destination + i * stride + indStart + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM); + } + } + if (remainNram) { + __bang_write_value(tmpNewMax, maxNum, -INFINITY); + __bang_write_zero(tmpSum, maxNum); + __bang_write_zero(src, maxNum); + + + for (int i = 0; i < dimsize; i++) { + __memcpy(src, source + i * stride + indStart + repeat * maxNum, remainNram * sizeof(float), GDRAM2NRAM); + __bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + if (i > 0) { + __bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, maxNum); //oldM = oldM - newM + __bang_active_exp_less_0(tmpOldMax, tmpOldMax, maxNum);//exp(oldM - newM) + __bang_mul(tmpSum, tmpSum, tmpOldMax, maxNum); //sum = sum * exp(oldM - newM) + } + __bang_add(tmpSum, tmpSum, src, maxNum); //sum += exp(x - M) + __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM + } + + __bang_active_recip_greater_1(tmpSum, tmpSum, maxNum);//compute 1/sum + //Start exponential transformation and write back to GDRAM + __bang_mul(src, src, tmpSum, maxNum);//The data stored in the src at the end of the loop above can be utilized + __memcpy(destination + (dimsize - 1) * stride + indStart + repeat * maxNum, src, remainNram * sizeof(float), NRAM2GDRAM); + for (int i = 0; i < dimsize - 1; i++) { + __memcpy(src, source + i * stride + indStart + repeat * maxNum, remainNram * sizeof(float), GDRAM2NRAM); + __bang_sub(src, src, tmpNewMax, maxNum); //x - M + __bang_active_exp_less_0(src, src, maxNum);//exp(x - M) + __bang_mul(src, src, tmpSum, maxNum); + __memcpy(destination + i * stride + indStart + repeat * maxNum, src, remainNram * sizeof(float), NRAM2GDRAM); + } + } + } + + __mlu_global__ void softmaxUnion1(float *mlu_destination, float *mlu_src, int nDim, int axis, int othersize, int frontsize, int dimsize, int stride) { + if (axis == nDim - 1) { + int dimS; + float mi = log2(dimsize); + if (floor(mi) == mi) { + dimS = dimsize; + } else { + dimS = pow(2, floor(mi) + 1); + } + if (dimS < warpSize) { + dimS = warpSize; + } + softmaxKernelAxis_e(mlu_destination, mlu_src, othersize, dimsize, dimS); + } else if (axis == 0) { + softmaxKernelAxis_s(mlu_destination, mlu_src, othersize, dimsize, stride); + } else { + float mi = log2(stride); + int strideS; + if (floor(mi) == mi) { + strideS = stride; + } else { + strideS = pow(2, floor(mi) + 1); + } + softmaxKernelAxis_m(mlu_destination, mlu_src, frontsize, dimsize, stride, strideS); + } + } + + + template + Routine lowerTypedBang(SoftmaxInfo info) { + using namespace runtime; + + return [info](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + auto mlu_src = reinterpret_cast(inputs[0]); + auto mlu_destination = reinterpret_cast(outputs[0]); + int dimsize = info.mid; + int stride = info.post; + int frontsize = info.pre; + int othersize = frontsize * stride; + int numBlocks = info.pre * info.post; + int nDim = 4; + int axis = 1; + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + res.fetchOrStore(); + cnrtQueue_t queue; + cnnlHandle_t handle = res.fetchOrStore()->handle; + cnnlGetQueue(handle, &queue); + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + + softmaxUnion1<<>>(mlu_destination, mlu_src, nDim, axis, othersize, frontsize, dimsize, stride); + }; + } + + auto SoftmaxBang::lower(Resources &res) const noexcept -> RoutineWorkspace { + switch (info.type.internal) { + case DataType::F32: + return lowerTypedBang(info); + case DataType::F64: + return lowerTypedBang(info); + case DataType::FP16: + return lowerTypedBang(info); + case DataType::BF16: + return lowerTypedBang(info); + default: + UNREACHABLE(); + } + } + + +}//namespace refactor::kernel diff --git a/src/04kernel/test/kernels/softmax/test_bang.cpp b/src/04kernel/test/kernels/softmax/test_bang.cpp new file mode 100644 index 000000000..806f169fc --- /dev/null +++ b/src/04kernel/test/kernels/softmax/test_bang.cpp @@ -0,0 +1,54 @@ +#ifdef USE_BANG + +#include "../../../src/kernels/softmax/bang_kernel.hh" +#include "../../../src/kernels/softmax/cpu_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, SoftmaxBang) { + // build routine + auto xTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + auto outTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 5, 4}); + dim_t axis = 1; + int nDim = 5; + auto kCpu = SoftmaxCpu::build(SoftmaxInfo(*xTensor, axis)); + auto kBang = SoftmaxBang::build(SoftmaxInfo(*xTensor, axis)); + ASSERT_TRUE(kCpu && kBang); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto rBang = kBang->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Mlu, 0, ""); + auto gpuIn = dev.malloc(xTensor->bytesSize()), + gpuOut = dev.malloc(outTensor->bytesSize()); + // put input data + std::vector + data(xTensor->elementsSize(), 0), + cpuOut(outTensor->elementsSize()); + gpuIn->copyFromHost(data.data(), xTensor->bytesSize()); + // inference + { + void const *inputs[]{data.data()}; + void *outputs[]{cpuOut.data()}; + rCpu(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{*gpuIn}; + void *outputs[]{*gpuOut}; + + rBang(res, nullptr, inputs, outputs); + } + // take output data + std::vector result(outTensor->elementsSize()); + gpuOut->copyToHost(result.data(), outTensor->bytesSize()); + // check + for (auto i : range0_(result.size())) { + EXPECT_FLOAT_EQ(cpuOut[i], result[i]); + } +} + +#endif