diff --git a/nntrainer/layers/cl_layers/concat_cl.cpp b/nntrainer/layers/cl_layers/concat_cl.cpp index 0d230a526f..5b04a48100 100644 --- a/nntrainer/layers/cl_layers/concat_cl.cpp +++ b/nntrainer/layers/cl_layers/concat_cl.cpp @@ -24,44 +24,6 @@ #include #include -std::string concat_cl_axis3_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void concat_cl_axis3_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels, - const int height, - const int width1, - const int width2) { - int global_id = get_global_id(0); - - int total_width = width1 + width2; - - int width = total_width; - - // 4D space coordinates - int w = global_id % total_width; - int h = (global_id / total_width) % height; - int c = (global_id / (total_width * height)) % channels; - int b = global_id / (total_width * height * channels); - - int output_index = ((b * channels + c) * height + h) * total_width + w; - - // Determining if the index is in in1 or in2 - if (w < width1) { - // in1 index calculation - int input1_index = ((b * channels + c) * height + h) * width1 + w; - out[output_index] = in1[input1_index]; - - } else { - // in2 index calculation - int input2_index = ((b * channels + c) * height + h) * width2 + (w - width1); - out[output_index] = in2[input2_index]; - } -})"; - std::string concat_cl_axis3_kernel_ = R"(__kernel void concat_cl_axis3(__global const float* in1, __global const float* in2, @@ -98,40 +60,6 @@ std::string concat_cl_axis3_kernel_ = } })"; -std::string concat_cl_axis2_kernel_fp16_ = - R"(__kernel void concat_cl_axis2_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels, - const int height1, - const int height2, - const int width) { - - int total_height = height1 + height2; - int global_id = get_global_id(0); - - // Calculate the coordinates in the 4D space - int w = global_id % width; - int h = 
(global_id / width) % total_height; - int c = (global_id / (width * total_height)) % channels; - int b = global_id / (width * total_height * channels); - - // Calculate the offset for the current batch, channel, and width in the output tensor - int output_index = ((b * channels + c) * total_height + h) * width + w; - - if (h < height1) { - // Index within input1 - int input1_index = ((b * channels + c) * height1 + h) * width + w; - out[output_index] = in1[input1_index]; - } else { - // Index within input2 - int input2_index = ((b * channels + c) * height2 + (h - height1)) * width + w; - out[output_index] = in2[input2_index]; - } - -})"; - std::string concat_cl_axis2_kernel_ = R"(__kernel void concat_cl_axis2(__global const float* in1, __global const float* in2, @@ -166,39 +94,6 @@ std::string concat_cl_axis2_kernel_ = })"; -std::string concat_cl_axis1_kernel_fp16_ = - R"(__kernel void concat_cl_axis1_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels1, - const int channels2, - const int height, - const int width) { - int global_id = get_global_id(0); - - int total_channels = channels1 + channels2; - - // Calculate the coordinates in the 4D space - int w = global_id % width; - int h = (global_id / width) % height; - int c = (global_id / (width * height)) % total_channels; - int b = global_id / (width * height * total_channels); - - // Calculate the offset for the current batch, height, and width in the output tensor - int output_index = ((b * total_channels + c) * height + h) * width + w; - - if (c < channels1) { - // Index within input1 - int input1_index = ((b * channels1 + c) * height + h) * width + w; - out[output_index] = in1[input1_index]; - } else { - // Index within input2 - int input2_index = ((b * channels2 + (c - channels1)) * height + h) * width + w; - out[output_index] = in2[input2_index]; - } -})"; - std::string concat_cl_axis1_kernel_ = R"(__kernel void concat_cl_axis1(__global 
const float* in1, __global const float* in2, @@ -303,11 +198,8 @@ void ConcatLayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel ConcatLayerCl::kernel_concat_axis3; -opencl::Kernel ConcatLayerCl::kernel_concat_axis3_fp16; opencl::Kernel ConcatLayerCl::kernel_concat_axis2; -opencl::Kernel ConcatLayerCl::kernel_concat_axis2_fp16; opencl::Kernel ConcatLayerCl::kernel_concat_axis1; -opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16; void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result) { @@ -321,7 +213,6 @@ void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2, input1_height = dim1.height(); input1_channels = dim1.channel(); input1_width = dim1.width(); - input2_height = dim2.height(); input2_channels = dim2.channel(); input2_width = dim2.width(); @@ -477,117 +368,6 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis3_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_height, unsigned int input1_width, - unsigned int input2_width) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_, - "concat_cl_axis3_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_channels * input1_height * - (input1_width + input2_width)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input2_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * (input1_width + 
input2_width), - true, nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::concat_cl_axis2(const float *matAdata, const float *vecXdata, float *vecYdata, unsigned int input1_batch_size, @@ -701,117 +481,6 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis2_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 
*vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_width, unsigned int input1_height, - unsigned int input2_height) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_, - "concat_cl_axis2_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_channels * input1_width * - (input1_height + input2_height)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input2_height * input1_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - (input1_height + input2_height) * input1_width, - true, nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); - if (!result) { - break; - } - 
- result = - kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::concat_cl_axis1(const float *matAdata, const float *vecXdata, float *vecYdata, unsigned int input1_batch_size, @@ -925,117 +594,6 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis1_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_height, - unsigned int input1_width, unsigned int input1_channels, - unsigned int input2_channels) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_, - "concat_cl_axis1_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_width * input1_height * - (input1_channels + input2_channels)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input2_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_width * - input1_height * (input1_channels + input2_channels), - true, nullptr); - - result = 
inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::calcDerivative(RunLayerContext &context) { // /** // * @todo skipping calcDerivative, support yet to be added diff --git a/nntrainer/layers/cl_layers/concat_cl.h b/nntrainer/layers/cl_layers/concat_cl.h index ce8599409e..ad52b48e32 100644 --- a/nntrainer/layers/cl_layers/concat_cl.h +++ b/nntrainer/layers/cl_layers/concat_cl.h @@ -106,12 +106,15 @@ class 
ConcatLayerCl : public Layer { inline static const std::string type = "concat"; - static opencl::Kernel kernel_concat_axis3; - static opencl::Kernel kernel_concat_axis3_fp16; - static opencl::Kernel kernel_concat_axis2; - static opencl::Kernel kernel_concat_axis2_fp16; static opencl::Kernel kernel_concat_axis1; + static opencl::Kernel kernel_concat_axis2; + static opencl::Kernel kernel_concat_axis3; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_concat_axis1_fp16; + static opencl::Kernel kernel_concat_axis2_fp16; + static opencl::Kernel kernel_concat_axis3_fp16; +#endif /** * @brief Process data and dimensions for concat @@ -174,6 +177,7 @@ class ConcatLayerCl : public Layer { unsigned int input2_channels); #ifdef ENABLE_FP16 + /** * @brief concat computation for axis 3 fp16 * @param[in] matAdata fp16 * for Input Tensor A @@ -231,6 +235,7 @@ class ConcatLayerCl : public Layer { unsigned int input1_channels, unsigned int input2_channelst); #endif + private: std::tuple concat_props; }; diff --git a/nntrainer/layers/cl_layers/concat_cl_fp16.cpp b/nntrainer/layers/cl_layers/concat_cl_fp16.cpp new file mode 100644 index 0000000000..ec90f66793 --- /dev/null +++ b/nntrainer/layers/cl_layers/concat_cl_fp16.cpp @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Niket Agarwal + * + * @file concat_cl_fp16.cpp + * @date 2 July 2024 + * @brief Implementation of Concat Layer + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @author Eunju Yang + * @bug No known bugs except for NYI items + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +std::string concat_cl_axis3_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void concat_cl_axis3_fp16(__global const half* in1, + __global const half* in2, + __global half* out, + const int batch_size, + const int channels, + const int height, + const int width1, + const int
width2) { + int global_id = get_global_id(0); + + int total_width = width1 + width2; + + int width = total_width; + + // 4D space coordinates + int w = global_id % total_width; + int h = (global_id / total_width) % height; + int c = (global_id / (total_width * height)) % channels; + int b = global_id / (total_width * height * channels); + + int output_index = ((b * channels + c) * height + h) * total_width + w; + + // Determining if the index is in in1 or in2 + if (w < width1) { + // in1 index calculation + int input1_index = ((b * channels + c) * height + h) * width1 + w; + out[output_index] = in1[input1_index]; + + } else { + // in2 index calculation + int input2_index = ((b * channels + c) * height + h) * width2 + (w - width1); + out[output_index] = in2[input2_index]; + } +})"; + +std::string concat_cl_axis2_kernel_fp16_ = + R"(__kernel void concat_cl_axis2_fp16(__global const half* in1, + __global const half* in2, + __global half* out, + const int batch_size, + const int channels, + const int height1, + const int height2, + const int width) { + + int total_height = height1 + height2; + int global_id = get_global_id(0); + + // Calculate the coordinates in the 4D space + int w = global_id % width; + int h = (global_id / width) % total_height; + int c = (global_id / (width * total_height)) % channels; + int b = global_id / (width * total_height * channels); + + // Calculate the offset for the current batch, channel, and width in the output tensor + int output_index = ((b * channels + c) * total_height + h) * width + w; + + if (h < height1) { + // Index within input1 + int input1_index = ((b * channels + c) * height1 + h) * width + w; + out[output_index] = in1[input1_index]; + } else { + // Index within input2 + int input2_index = ((b * channels + c) * height2 + (h - height1)) * width + w; + out[output_index] = in2[input2_index]; + } + +})"; + +std::string concat_cl_axis1_kernel_fp16_ = + R"(__kernel void concat_cl_axis1_fp16(__global const half* in1, + __global 
const half* in2, + __global half* out, + const int batch_size, + const int channels1, + const int channels2, + const int height, + const int width) { + int global_id = get_global_id(0); + + int total_channels = channels1 + channels2; + + // Calculate the coordinates in the 4D space + int w = global_id % width; + int h = (global_id / width) % height; + int c = (global_id / (width * height)) % total_channels; + int b = global_id / (width * height * total_channels); + + // Calculate the offset for the current batch, height, and width in the output tensor + int output_index = ((b * total_channels + c) * height + h) * width + w; + + if (c < channels1) { + // Index within input1 + int input1_index = ((b * channels1 + c) * height + h) * width + w; + out[output_index] = in1[input1_index]; + } else { + // Index within input2 + int input2_index = ((b * channels2 + (c - channels1)) * height + h) * width + w; + out[output_index] = in2[input2_index]; + } +})"; + +namespace nntrainer { + +static constexpr size_t SINGLE_INOUT_IDX = 0; +static constexpr size_t INPUT_IDX_1 = 0; +static constexpr size_t INPUT_IDX_2 = 1; + +opencl::Kernel ConcatLayerCl::kernel_concat_axis3_fp16; +opencl::Kernel ConcatLayerCl::kernel_concat_axis2_fp16; +opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16; + +void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input2_width) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_, + "concat_cl_axis3_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_channels * input1_height * + (input1_width + input2_width)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + 
input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input2_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * (input1_width + input2_width), + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } 
while (false); +} + +void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_width, + unsigned int input1_height, + unsigned int input2_height) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_, + "concat_cl_axis2_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_channels * input1_width * + (input1_height + input2_height)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input2_height * input1_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + (input1_height + input2_height) * input1_width, + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + 
} + + result = + kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input1_channels, + unsigned int input2_channels) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_, + "concat_cl_axis1_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_width * input1_height * + (input1_channels + input2_channels)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input2_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_width * + input1_height * (input1_channels + input2_channels), + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = 
inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build index fbfd46961b..57fb942bc9 100644 --- a/nntrainer/layers/cl_layers/meson.build +++ b/nntrainer/layers/cl_layers/meson.build @@ -7,6 +7,15 @@ cl_layer_sources = [ 'concat_cl.cpp', ] +if get_option('enable-fp16') + cl_layer_sources += [ + 'swiglu_cl_fp16.cpp', + 'reshape_cl_fp16.cpp', + 'rmsnorm_layer_cl_fp16.cpp', + 'concat_cl_fp16.cpp', + ] +endif + foreach s 
: cl_layer_sources nntrainer_sources += meson.current_source_dir() / s endforeach diff --git a/nntrainer/layers/cl_layers/reshape_cl.cpp b/nntrainer/layers/cl_layers/reshape_cl.cpp index 724e38c549..5e48bbeb02 100644 --- a/nntrainer/layers/cl_layers/reshape_cl.cpp +++ b/nntrainer/layers/cl_layers/reshape_cl.cpp @@ -17,21 +17,6 @@ #include #include -std::string copy_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void copy_cl_fp16(__global const half* input, - __global half* output, - const int batchsize, - const int channels, - const int height, - const int width) { - - int i= get_global_id(0); - output[i] = input[i]; - -})"; - std::string copy_cl_kernel_ = R"(__kernel void copy_cl(__global const float* input, __global float* output, @@ -99,7 +84,6 @@ void ReshapeLayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel ReshapeLayerCl::kernel_copy; -opencl::Kernel ReshapeLayerCl::kernel_copy_fp16; void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) { @@ -127,89 +111,6 @@ void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) { } } -void ReshapeLayerCl::copy_cl_fp16(const _FP16 *input, _FP16 *res, - unsigned int input_batch_size, - unsigned int input_channels, - unsigned int input_height, - unsigned int input_width) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_copy_ptr = - cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16"); - if (!kernel_copy_ptr) { - break; - } - - size_t dim_size = sizeof(_FP16) * input_batch_size * input_height * - input_width * input_channels; - - opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); - if (!result) { - break; - } - - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); - if (!result) { - 
break; - } - - result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {(int)dim_size, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_copy_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); - if (!result) { - break; - } - - } while (false); -} - void ReshapeLayerCl::copy_cl(const float *input, float *res, unsigned int input_batch_size, unsigned int input_channels, diff --git a/nntrainer/layers/cl_layers/reshape_cl.h b/nntrainer/layers/cl_layers/reshape_cl.h index 679214189e..9c83beb360 100644 --- a/nntrainer/layers/cl_layers/reshape_cl.h +++ b/nntrainer/layers/cl_layers/reshape_cl.h @@ -106,7 +106,10 @@ class ReshapeLayerCl : public Layer { inline static const std::string type = "reshape"; static opencl::Kernel kernel_copy; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_copy_fp16; +#endif /** * @brief Process data and dimensions for reshape operation diff --git a/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp b/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp new file mode 100644 index 0000000000..9936bc4ef1 --- /dev/null +++ b/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * 
Copyright (C) 2024 Niket Agarwal + * + * @file reshape_cl_fp16.cpp + * @date 18 June 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @author Eunju Yang + * @bug No known bugs except for NYI items + * @brief This is Reshape GPU Layer Implementation + */ + +#include +#include +#include +#include +#include +#include + +std::string copy_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void copy_cl_fp16(__global const half* input, + __global half* output, + const int batchsize, + const int channels, + const int height, + const int width) { + + int i= get_global_id(0); + output[i] = input[i]; + +})"; + +namespace nntrainer { + +opencl::Kernel ReshapeLayerCl::kernel_copy_fp16; + +void ReshapeLayerCl::copy_cl_fp16(const _FP16 *input, _FP16 *res, + unsigned int input_batch_size, + unsigned int input_channels, + unsigned int input_height, + unsigned int input_width) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_copy_ptr = + cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16"); + if (!kernel_copy_ptr) { + break; + } + + size_t dim_size = sizeof(_FP16) * input_batch_size * input_height * + input_width * input_channels; + + opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, + nullptr); + + opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, + nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); + if (!result) { + break; + } + + result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + 
kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim_size, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_copy_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + if (!result) { + break; + } + + } while (false); +} + +} /* namespace nntrainer */ + diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp index 2df9caf29e..137e94a90c 100644 --- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp @@ -20,41 +20,6 @@ #include #include -std::string rmsnorm_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void rmsnorm_cl_fp16( - __global const half *input, // Input tensor - __global half *output, // Output tensor - __global const half *alpha, // Alpha values (one for each width) - half epsilon, - int B, // Number of batches - int C, // Number of channels - int H, // Height of feature map - int W // Width of feature map -) { - int global_id = get_global_id(0); // Get the global work item index - - // Compute the corresponding batch, height, and channel indices - int n = global_id / C; // Batch index - int c = global_id % C; // Height index - int h = get_global_id(1); // Channel index - int index = ((n * C + c) * H + h) * W; - - // Calculate RMS norm for the current channel, height, and batch - half sum_squares = 0.0f; - for (int j = 0; j < W; ++j) { - sum_squares += input[index+j] * input[index+j]; - } - sum_squares /= W; - half rms_norm = 
sqrt(sum_squares + epsilon); - // Each work item processes all width elements for its specific n, h, c - for (int w = 0; w < W; ++w) { - output[index+w] = (input[index+w] / rms_norm) * alpha[w]; - } -} -)"; - std::string rmsnorm_cl_kernel_ = R"(__kernel void rmsnorm_cl( __global const float *input, // Input tensor @@ -124,7 +89,6 @@ void RMSNormLayerCl::forwarding(RunLayerContext &context, bool training) { } opencl::Kernel RMSNormLayerCl::kernel_rmsnorm; -opencl::Kernel RMSNormLayerCl::kernel_rmsnorm_fp16; void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result, Tensor const &gamma, const float epsilon) { @@ -219,96 +183,6 @@ void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result, } while (false); } -void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result, - Tensor const &gamma, - const float epsilon) { - - bool ret = false; - int dim1 = input.batch() * input.height() * input.width() * input.channel(); - CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), - input.width(), input.getTensorType()); - int b = input.batch(); - int c = input.channel(); - int h = input.height(); - int w = input.width(); - do { - ClContext::SharedPtrClKernel kernel_rmsnorm_ptr = - cl_context_ref.registerClKernel(rmsnorm_cl_kernel_fp16_, - "rmsnorm_cl_fp16"); - if (!kernel_rmsnorm_ptr) { - break; - } - opencl::Buffer inputbuf(cl_context_ref.context_inst_, - dim1 * sizeof(cl_half), true, nullptr); - - opencl::Buffer gammabuf(cl_context_ref.context_inst_, - input.width() * sizeof(cl_half), true, nullptr); - opencl::Buffer resultbuf(cl_context_ref.context_inst_, - dim1 * sizeof(cl_half), true, nullptr); - - const _FP16 *data = input.getData<_FP16>(); - _FP16 *rdata = result.getData<_FP16>(); - const _FP16 *gdata = gamma.getData<_FP16>(); - ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data); - if (!ret) { - break; - } - - ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata); - if (!ret) 
{ - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(4, &b, sizeof(int)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(3, &epsilon, sizeof(cl_half)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(5, &c, sizeof(int)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(6, &h, sizeof(int)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(7, &w, sizeof(int)); - if (!ret) { - break; - } - const int work_groups_count[3] = {b * c, h, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - ret = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_rmsnorm_ptr, work_groups_count, work_group_size); - if (!ret) { - break; - } - - ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata); - if (!ret) { - break; - } - } while (false); -} - void RMSNormLayerCl::incremental_forwarding(nntrainer::RunLayerContext &context, unsigned int from, unsigned int to, bool training) { @@ -339,7 +213,11 @@ void RMSNormLayerCl::incremental_forwarding(nntrainer::RunLayerContext &context, if (in_step.getDataType() == ml::train::TensorDim::DataType::FP32) { rmsnormProcess(in, out, gamma, epsilon); } else { +#ifdef ENABLE_FP16 rmsnormProcess_fp16(in, out, gamma, epsilon); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif } } diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h index 43f942ea1e..a3b123dc6e 100644 --- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h @@ -119,7 +119,10 @@ class 
RMSNormLayerCl : public LayerImpl { const std::string getType() const override { return RMSNormLayerCl::type; }; static opencl::Kernel kernel_rmsnorm; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_rmsnorm_fp16; +#endif /** * @brief Process data and dimensions for rms norm operation diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp b/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp new file mode 100644 index 0000000000..b788d877cd --- /dev/null +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Thummala Pallavi + * + * @file rmsnorm_layer_fp16_cl.cpp + * @date 8 June 2024 + * @brief This is RMSNorm Layer Class for Neural Network with + * OpenCl implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Thummala Pallavi + * @author Eunju Yang + * @bug No known bugs except for NYI items + * + */ + + +#include +#include +#include +#include +#include +#include +#include + +std::string rmsnorm_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void rmsnorm_cl_fp16( + __global const half *input, // Input tensor + __global half *output, // Output tensor + __global const half *alpha, // Alpha values (one for each width) + half epsilon, + int B, // Number of batches + int C, // Number of channels + int H, // Height of feature map + int W // Width of feature map +) { + int global_id = get_global_id(0); // Get the global work item index + + // Compute the corresponding batch, height, and channel indices + int n = global_id / C; // Batch index + int c = global_id % C; // Height index + int h = get_global_id(1); // Channel index + int index = ((n * C + c) * H + h) * W; + + // Calculate RMS norm for the current channel, height, and batch + half sum_squares = 0.0f; + for (int j = 0; j < W; ++j) { + sum_squares += input[index+j] * input[index+j]; + } + sum_squares /= W; + half rms_norm = sqrt(sum_squares + epsilon); 
+ // Each work item processes all width elements for its specific n, h, c + for (int w = 0; w < W; ++w) { + output[index+w] = (input[index+w] / rms_norm) * alpha[w]; + } +} +)"; + +namespace nntrainer { + +opencl::Kernel RMSNormLayerCl::kernel_rmsnorm_fp16; + +void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result, + Tensor const &gamma, + const float epsilon) { + + bool ret = false; + int dim1 = input.batch() * input.height() * input.width() * input.channel(); + CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), + input.width(), input.getTensorType()); + int b = input.batch(); + int c = input.channel(); + int h = input.height(); + int w = input.width(); + do { + ClContext::SharedPtrClKernel kernel_rmsnorm_ptr = + cl_context_ref.registerClKernel(rmsnorm_cl_kernel_fp16_, + "rmsnorm_cl_fp16"); + if (!kernel_rmsnorm_ptr) { + break; + } + opencl::Buffer inputbuf(cl_context_ref.context_inst_, + dim1 * sizeof(cl_half), true, nullptr); + + opencl::Buffer gammabuf(cl_context_ref.context_inst_, + input.width() * sizeof(cl_half), true, nullptr); + opencl::Buffer resultbuf(cl_context_ref.context_inst_, + dim1 * sizeof(cl_half), true, nullptr); + + const _FP16 *data = input.getData<_FP16>(); + _FP16 *rdata = result.getData<_FP16>(); + const _FP16 *gdata = gamma.getData<_FP16>(); + ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data); + if (!ret) { + break; + } + + ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(4, &b, sizeof(int)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(3, 
&epsilon, sizeof(cl_half)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(5, &c, sizeof(int)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(6, &h, sizeof(int)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(7, &w, sizeof(int)); + if (!ret) { + break; + } + const int work_groups_count[3] = {b * c, h, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + ret = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_rmsnorm_ptr, work_groups_count, work_group_size); + if (!ret) { + break; + } + + ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata); + if (!ret) { + break; + } + } while (false); +} + +} // namespace nntrainer + diff --git a/nntrainer/layers/cl_layers/swiglu_cl.cpp b/nntrainer/layers/cl_layers/swiglu_cl.cpp index c9c1891e44..ca991b2354 100644 --- a/nntrainer/layers/cl_layers/swiglu_cl.cpp +++ b/nntrainer/layers/cl_layers/swiglu_cl.cpp @@ -13,15 +13,6 @@ #include "swiglu_cl.h" #include -std::string swiglu_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void swiglu_cl_fp16(__global const half *in1, __global const half *in2, __global half *out) { - int i = get_global_id(0); - half swish = in1[i] * exp(in1[i]) / (1 + exp(in1[i])); - out[i] = swish * in2[i]; -})"; - std::string swiglu_cl_kernel_ = R"(__kernel void swiglu_cl(__global const float *in1, __global const float *in2, __global float *out) { int i = get_global_id(0); @@ -64,7 +55,6 @@ void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel SwiGLULayerCl::kernel_swiglu; -opencl::Kernel SwiGLULayerCl::kernel_swiglu_fp16; void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2, Tensor &result) { @@ -104,84 +94,14 @@ void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata, } int dim = int(dim1 * dim2); - opencl::Buffer inputA(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, 
- nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, - nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_swiglu_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - -void SwiGLULayerCl::swiglu_cl_fp16(const _FP16 *matAdata, - const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int dim1, unsigned int dim2){ - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_swiglu_ptr = - cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16"); - if (!kernel_swiglu_ptr) { - break; - } - - int dim = int(dim1 * dim2); - opencl::Buffer inputA(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(float) * dim1 * dim2, true, nullptr); - opencl::Buffer inputX(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inputX(cl_context_ref.context_inst_, + 
sizeof(float) * dim1 * dim2, true, nullptr); - opencl::Buffer inOutY(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(float) * dim1 * dim2, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); if (!result) { diff --git a/nntrainer/layers/cl_layers/swiglu_cl.h b/nntrainer/layers/cl_layers/swiglu_cl.h index 215d817c1e..51f388bd88 100644 --- a/nntrainer/layers/cl_layers/swiglu_cl.h +++ b/nntrainer/layers/cl_layers/swiglu_cl.h @@ -79,7 +79,7 @@ class SwiGLULayerCl final : public Layer { * @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method) */ void exportTo(Exporter &exporter, - const ml::train::ExportMethods &method) const override {}; + const ml::train::ExportMethods &method) const override{}; /** * @copydoc Layer::getType() @@ -94,7 +94,10 @@ class SwiGLULayerCl final : public Layer { inline static const std::string type = "swiglu"; static opencl::Kernel kernel_swiglu; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_swiglu_fp16; +#endif std::tuple swiglu_props; /**< swiglu layer properties : unit - number of output neurons */ diff --git a/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp b/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp new file mode 100644 index 0000000000..3fe2d1f701 --- /dev/null +++ b/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * + * @file swiglu_cl.cpp + * @date 6th June 2024 + * @brief Implementation of SwiGLU activation function + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @bug No known bugs except for NYI items + * + */ + +#include "swiglu_cl.h" +#include + +std::string swiglu_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void swiglu_cl_fp16(__global const half *in1, __global const half *in2, __global half *out) { + int i = get_global_id(0); + half swish = in1[i] * 
exp(in1[i]) / (1 + exp(in1[i])); + out[i] = swish * in2[i]; +})"; + +namespace nntrainer { + +void SwiGLULayerCl::swiglu_cl_fp16(const _FP16 *matAdata, const _FP16 *vecXdata, + _FP16 *vecYdata, unsigned int dim1, + unsigned int dim2) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_swiglu_ptr = + cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16"); + if (!kernel_swiglu_ptr) { + break; + } + + int dim = int(dim1 * dim2); + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_swiglu_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +} // namespace nntrainer + diff --git a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp index 
c1284b0a9c..b4c3978dee 100644 --- a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp @@ -16,7 +16,7 @@ namespace nntrainer { -void rotary_emb_cl(__fp16 *in, __fp16 *out, +void rotary_emb_cl(_FP16 *in, _FP16 *out, std::vector> freqs_cos, std::vector> freqs_sin, std::vector cos_, std::vector sin_, diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp index 44d0d023c7..9235605eeb 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp @@ -10,6 +10,7 @@ * @bug No known bugs except for NYI items * */ +#ifdef ENABLE_FP16 #include #include @@ -402,3 +403,5 @@ void sscal_cl(_FP16 *X, const unsigned int N, const float alpha){ } while (false); } } // namespace nntrainer + +#endif