feat: integrate CNNL and add unary/binary/softmax/batchnorm/reduce/transpose/pooling operators
Chamberlain0w0 committed Jan 10, 2024
1 parent 7f82d74 commit 9f34dda
Showing 40 changed files with 1,918 additions and 9 deletions.
8 changes: 2 additions & 6 deletions src/02hardware/CMakeLists.txt
@@ -3,14 +3,10 @@ project(hardware VERSION 0.0.0 LANGUAGES CXX)
message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})

# Source files
file(GLOB HARDWARE_SRC src/*.cc src/*.cpp src/devices/cpu/*.cc)
file(GLOB_RECURSE HARDWARE_SRC src/*.cc src/*.cpp)

if(USE_CUDA)
file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu src/devices/nvidia/*.cc)
endif()

if(USE_BANG)
file(GLOB_RECURSE HARDWARE_BANG_SRC src/devices/mlu/*.cc)
file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu)
endif()

add_library(hardware STATIC ${HARDWARE_SRC} ${HARDWARE_CUDA_SRC} ${HARDWARE_BANG_SRC})
2 changes: 2 additions & 0 deletions src/02hardware/src/device_manager.cpp
@@ -1,6 +1,7 @@
#include "hardware/device_manager.h"
#include "hardware/devices/cpu.h"
#include "hardware/devices/nvidia.h"
#include "hardware/devices/mlu.h"

namespace refactor::hardware::device {

@@ -37,6 +38,7 @@ namespace refactor::hardware::device {
using T = Device::Type;
// clang-format off
auto device = type == T::Nvidia ? std::make_shared<Nvidia>(card)
: type == T::Mlu ? std::make_shared<Mlu>(card)
: UNREACHABLEX(Arc<Device>, "");
// clang-format on
auto [kind, ok] = DEVICES.try_emplace(static_cast<int32_t>(type));
8 changes: 6 additions & 2 deletions src/02hardware/src/devices/mlu/device.cc
@@ -1,21 +1,25 @@
#include "functions.cc"
#include "functions.hh"
#include "hardware/devices/mlu.h"
#include "hardware/mem_pool.h"
#include "memory.hh"

namespace refactor::hardware {

static Arc<Memory> bangMemory(int32_t card) {
#ifdef USE_BANG
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}",
fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}",
card, free, total, size);
return std::make_shared<MemPool>(
std::make_shared<MluMemory>(),
size,
256ul);
#else
return nullptr;
#endif
}

Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {}
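Note: the pool size computed in bangMemory() above is "80% of the card's total memory, but at least 5 GiB, and never more than what is currently free". A minimal self-contained C++ sketch of the same rule with made-up numbers (the 16 GiB / 12 GiB figures and variable names are illustrative only):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical card: 16 GiB total, 12 GiB currently free.
    std::size_t totalBytes = 16ull << 30, freeBytes = 12ull << 30;
    // Same rule as bangMemory(): max(5 GiB, 80% of total), clamped to free memory.
    std::size_t size = std::min(freeBytes, std::max<std::size_t>(5ull << 30, totalBytes * 4 / 5));
    // Prints 12.0: 12.8 GiB requested, clamped to the 12 GiB that are free.
    std::printf("pool size: %.1f GiB\n", size / double(1ull << 30));
    return 0;
}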
2 changes: 2 additions & 0 deletions src/02hardware/src/devices/mlu/functions.cc
@@ -2,6 +2,7 @@

namespace refactor::hardware {

#ifdef USE_BANG
int getDeviceCount() {
unsigned deviceCount;
BANG_ASSERT(cnrtGetDeviceCount(&deviceCount));
Expand All @@ -15,5 +16,6 @@ namespace refactor::hardware {
BANG_ASSERT(cnrtMemGetInfo(&memInfo.free, &memInfo.total));
return memInfo;
}
#endif

}// namespace refactor::hardware
5 changes: 4 additions & 1 deletion src/02hardware/src/devices/mlu/functions.hh
@@ -1,14 +1,17 @@
#ifndef HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
#define HARDWARE_DEVICES_MLU_FUNCTIONS_CUH

#include "cnrt.h"
#include "common.h"

#ifdef USE_BANG
#include "cnrt.h"

#define BANG_ASSERT(STATUS) \
if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \
RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
cnrtGetErrorStr(status), (int) status)); \
}
#endif

namespace refactor::hardware {

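Note: BANG_ASSERT above turns any CNRT status other than CNRT_RET_SUCCESS into a RUNTIME_ERROR that embeds the statement text, cnrtGetErrorStr(status) and the numeric code. A minimal usage sketch, assuming the CNRT allocation call cnrtMalloc (the helper name allocOrThrow is illustrative and not part of this codebase):

// Sketch only; any CNRT call that returns a status can be wrapped the same way.
void *allocOrThrow(size_t bytes) {
    void *ptr = nullptr;
    // On failure this raises RUNTIME_ERROR with a message like:
    //   bang failed on "cnrtMalloc(&ptr, bytes)" with "<error string>" (<code>)
    BANG_ASSERT(cnrtMalloc(&ptr, bytes));
    return ptr;
}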
2 changes: 2 additions & 0 deletions src/02hardware/src/devices/mlu/memory.cc
@@ -2,6 +2,7 @@
#include "functions.hh"

namespace refactor::hardware {
#ifdef USE_BANG
using M = MluMemory;

void *M::malloc(size_t size) {
Expand All @@ -27,5 +28,6 @@ namespace refactor::hardware {
CNRT_MEM_TRANS_DIR_PEER2PEER));
return dst;
}
#endif

}// namespace refactor::hardware
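Note: the hunk above only shows the tail of MluMemory's device-to-device copy. For orientation, a free-standing sketch of such a copy using the same transfer direction; the helper name copyOnDevice and the classic cnrtMemcpy(dst, src, bytes, dir) signature are assumptions, not taken from this file:

// Illustrative helper, not the project's Memory interface.
void *copyOnDevice(void *dst, void const *src, size_t bytes) {
    BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                           CNRT_MEM_TRANS_DIR_PEER2PEER));
    return dst;
}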
4 changes: 4 additions & 0 deletions src/02hardware/src/devices/nvidia/device.cc
@@ -6,6 +6,7 @@
namespace refactor::hardware {

static Arc<Memory> cudaMemory(int32_t card) {
#ifdef USE_CUDA
ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
setDevice(card);
auto [free, total] = getMemInfo();
Expand All @@ -16,6 +17,9 @@ namespace refactor::hardware {
std::make_shared<NvidiaMemory>(),
size,
256ul);
#else
return nullptr;
#endif
}

Nvidia::Nvidia(int32_t card) : Device(card, cudaMemory(card)) {}
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/batch_normalization.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/batch_normalization.h"
#include "../kernels/batch_normalization/cpu_kernel.hh"
#include "../kernels/batch_normalization/cudnn_kernel.hh"
#include "../kernels/batch_normalization/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -20,6 +21,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(BatchNormalizationCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(BatchNormalizationCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
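Note: the REGISTER macro used by this and the following collectors is defined elsewhere in the kernel sources and is not part of this diff. Judging from the explicit build-and-collect pattern in the softmax and transpose collectors further down, it presumably expands to something like the sketch below, where epsilon, inputs and ans stand in for whatever the surrounding collector has in scope:

// Hypothetical expansion of REGISTER(BatchNormalizationCnnl).
if (auto ptr = BatchNormalizationCnnl::build(epsilon, inputs); ptr) {
    ans.emplace_back(std::move(ptr));
}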
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/reduce.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/reduce.h"
#include "../kernels/reduce/cpu_kernel.hh"
#include "../kernels/reduce/cudnn_kernel.hh"
#include "../kernels/reduce/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -27,6 +28,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(ReduceCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(ReduceCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/simple_binary.cc
@@ -2,6 +2,7 @@
#include "../kernels/simple_binary/binary_cudnn.hh"
#include "../kernels/simple_binary/cpu_kernel.hh"
#include "../kernels/simple_binary/cuda_kernel.hh"
#include "../kernels/simple_binary/binary_cnnl.hh"

namespace refactor::kernel {

@@ -48,6 +49,9 @@ namespace refactor::kernel {
REGISTER_BROCAST(BinaryCudnn)
REGISTER(BinaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER_BROCAST(BinaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/simple_unary.cc
@@ -2,6 +2,8 @@
#include "../kernels/simple_unary/cpu_kernel.hh"
#include "../kernels/simple_unary/cuda_kernel.hh"
#include "../kernels/simple_unary/cudnn_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_simple_unary_kernel.hh"
#include "common.h"

namespace refactor::kernel {
@@ -54,6 +56,10 @@ namespace refactor::kernel {
REGISTER(ActivationCudnn)
REGISTER(SimpleUnaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER(ActivationCnnl)
REGISTER(SimpleUnaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
7 changes: 7 additions & 0 deletions src/04kernel/src/collectors/softmax.cc
@@ -1,4 +1,5 @@
#include "kernel/collectors/softmax.h"
#include "../kernels/softmax/cnnl_kernel.hh"
#include "../kernels/softmax/cpu_kernel.hh"
#include "../kernels/softmax/cuda_kernel.hh"
#include "../kernels/softmax/cudnn_kernel.hh"
@@ -28,6 +29,12 @@ namespace refactor::kernel {
}
break;
}
case decltype(_target)::Mlu: {
if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
}
default:
UNREACHABLEX(void, "Unknown target");
}
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/transpose.cc
@@ -1,6 +1,7 @@
#include "kernel/collectors/transpose.h"
#include "../kernels/transpose/cpu_kernel.hh"
#include "../kernels/transpose/cuda_kernel.hh"
#include "../kernels/transpose/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -25,6 +26,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = TransposeCnnl::build(data.dataType, data.shape, perm); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
158 changes: 158 additions & 0 deletions src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc
@@ -0,0 +1,158 @@
#include "cnnl_kernel.hh"

#ifdef USE_BANG
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#include <cnnl.h>
#endif

namespace refactor::kernel {
using K = BatchNormalizationCnnl;
using DT = DataType;

K::BatchNormalizationCnnl(decltype(info) info_) noexcept
: info(info_) {}

auto K::build(float epsilon, TensorRefs inputs) noexcept -> KernelBox {
#ifndef USE_BANG
return nullptr;
#endif

auto const &x = inputs[0].get();
auto const &scale = inputs[1].get();
auto const &mean = inputs[3].get();

if (x.rank() != 4) {
return nullptr;
}

// see "Supported Configurations for `cnnlBatchNormalizationForwardInference`"
if (scale.dataType != mean.dataType) {
return nullptr;
}
if (x.dataType == DT::F64) {
if (scale.dataType != DT::F64) {
return nullptr;
}
} else {
if (scale.dataType != DT::F32) {
return nullptr;
}
}
return std::make_unique<K>(decltype(info){
epsilon,
x.dataType,
scale.dataType,
x.layout,
{
static_cast<int>(x.shape[0]),
static_cast<int>(x.shape[1]),
static_cast<int>(x.shape[2]),
static_cast<int>(x.shape[3]),
}});
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing batch normalization for non-training-mode using CNNL";
}

#ifdef USE_BANG

auto K::lower(Resources &res) const -> RoutineWorkspace {
using namespace cnnl;
using namespace runtime;
using DT = DataType;

// RAII for closure
struct Descriptors {
cnnlTensorDescriptor_t inDesc, inDescTrans, p;
cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW;
bool f32;

explicit Descriptors(decltype(f32) f32_)
: inDesc(nullptr), inDescTrans(nullptr), p(nullptr),
NCHW2NHWC(nullptr), NHWC2NCHW(nullptr), f32(f32_) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDescTrans));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&p));
CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC));
CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW));
}
~Descriptors() noexcept(false) {
CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDescTrans));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(p));
CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC));
CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW));
}

Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>(info.dtX != DT::F64);
int dimNCHW[4] = {info.dimAx[0], info.dimAx[1], info.dimAx[2], info.dimAx[3]};
int dimNHWC[4] = {info.dimAx[0], info.dimAx[2], info.dimAx[3], info.dimAx[1]};
int dimParam[]{info.dimAx[1]};
setCnnlTensor(d->inDesc, info.dtX, slice(dimNCHW, 4));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->inDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dtX), 4, dimNHWC));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->p, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dtP), 1, dimParam));
int permute[4] = {0, 2, 3, 1};
int permuteOut[4] = {0, 3, 1, 2};
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permute));
CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut));

auto handle = res.fetchOrStore<CnnlContext>()->handle;
auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * sizeof(info.dtX);
size_t workspaceSize;
CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize));
size_t totalWorkspaceSize = xTransSize + workspaceSize;

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d),
epsilon = info.epsilon,
xTransSize, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// fetch cnnl handle from resources
auto handle = res.fetchOrStore<CnnlContext>()->handle;

// name inputs and outputs
auto x = inputs[0],
scale = inputs[1],
bias = inputs[2],
mean = inputs[3],
var = inputs[4];
auto y = outputs[0];

void *xTrans = workspace;
void *yTrans = xTrans + xTransSize;
void *cursor = yTrans + workspaceSize;

// transpose NCHW input to NHWC
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x,
d->inDescTrans, xTrans, cursor, workspaceSize));

// build alpha/beta for double
auto a = d->f32 ? factor<fp32_t>(1) : factor<fp64_t>(1),
b = d->f32 ? factor<fp32_t>(0) : factor<fp64_t>(0);
CNNL_ASSERT(cnnlBatchNormForwardInference(
handle, &a, &b,
d->inDescTrans, xTrans, d->p, scale, bias, mean, var,
epsilon, d->inDescTrans, yTrans));

// transpose NHWC intermediates to NCHW
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans,
d->inDesc, y, cursor, workspaceSize));

BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
};

return {std::move(routine), totalWorkspaceSize};
}

#endif

}// namespace refactor::kernel