From 0b432a67d193589d3c7a85893c710ee86e9c7bc8 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Wed, 23 Apr 2025 09:53:03 +0530
Subject: [PATCH 1/4] SYCL: Add all missing unary kernels

ggml-ci
---
 ggml/src/ggml-sycl/element_wise.cpp | 179 ++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/element_wise.hpp |   5 +
 ggml/src/ggml-sycl/ggml-sycl.cpp    |  13 ++
 3 files changed, 197 insertions(+)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index fc25d98ddff1a..c7a7e4c109f50 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -1,6 +1,10 @@
 #include "common.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
+#include
+#include
+#include
+#include
 
 static void acc_f32(const float * x, const float * y, float * dst, const int ne,
                     const int ne10, const int ne11, const int ne12,
@@ -21,6 +25,33 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }
 
+template <typename T>
+static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_global_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
+}
+
+template <typename T>
+static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_global_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fabs(x[i]);
+}
+
+template <typename T>
+static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_global_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
+}
+
 template <typename T>
 static void gelu(const T * x, T * dst, const int k,
                  const sycl::nd_item<3> &item_ct1) {
@@ -335,6 +366,37 @@ static void silu_sycl(const T *x, T *dst, const int k,
         });
 }
 
+template <typename T>
+static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = (k + 256 - 1) / 256;
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            sgn(x, dst, k, item_ct1);
+        });
+}
+
+template <typename T>
+static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = (k + 256 - 1) / 256;
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            abs_op(x, dst, k, item_ct1);
+        });
+}
+
+
+template <typename T>
+static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = (k + 256 - 1) / 256;
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            elu_op(x, dst, k, item_ct1);
+        });
+}
+
 template <typename T>
 static void gelu_quick_sycl(const T *x, T *dst, const int k,
                             queue_ptr stream) {
@@ -574,6 +636,106 @@ static void clamp_sycl(const T *x, T *dst, const float min,
         });
 }
 
+inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+
+inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
 inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
@@ -1388,3 +1550,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_sgn(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_abs(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_elu(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index e623cb56f7625..f4199d69da694 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 #endif // GGML_SYCL_ELEMENTWISE_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 548f2d0a06be0..66b6f2cca4da9 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -38,6 +38,7 @@
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
+#include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
@@ -3355,6 +3356,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                 case GGML_UNARY_OP_EXP:
                     ggml_sycl_exp(ctx, dst);
                     break;
+                case GGML_UNARY_OP_SGN:
+                    ggml_sycl_sgn(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_ABS:
+                    ggml_sycl_abs(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_ELU:
+                    ggml_sycl_elu(ctx, dst);
+                    break;
                 default:
                     return false;
             }
@@ -3837,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
+            case GGML_UNARY_OP_SGN:
+            case GGML_UNARY_OP_ABS:
+            case GGML_UNARY_OP_ELU:
 #if defined (GGML_SYCL_F16)
                 return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
 #else

From 8facdb8701dffc482dc84a3201b6151238cab71c Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Wed, 23 Apr 2025 21:29:04 +0530
Subject: [PATCH 2/4] decouple kernel launch range from data size using strided loop

---
 ggml/src/ggml-sycl/element_wise.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index c7a7e4c109f50..34095066433c0 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -27,29 +27,23 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
 
 template <typename T>
 static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_global_id(2);
-    if (i >= k) {
-        return;
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
     }
-    dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
 }
 
 template <typename T>
 static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_global_id(2);
-    if (i >= k) {
-        return;
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = sycl::fabs(x[i]);
     }
-    dst[i] = sycl::fabs(x[i]);
 }
 
 template <typename T>
 static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_global_id(2);
-    if (i >= k) {
-        return;
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
     }
-    dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
 }
 
 template <typename T>

From 706451a669b442af5b9b96da191a1db75713047b Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Thu, 24 Apr 2025 12:29:13 +0530
Subject: [PATCH 3/4] use ceil_div helper for num_blocks

ggml-ci
---
 ggml/src/ggml-sycl/common.hpp       | 4 ++++
 ggml/src/ggml-sycl/element_wise.cpp | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 0ab0fb0aa394d..c3d9d186456ac 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -493,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) {
 
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
+constexpr size_t ceil_div(const size_t m, const size_t n) {
+    return (m + n - 1) / n;
+}
+
 bool gpu_has_xmx(sycl::device &dev);
 #endif // GGML_SYCL_COMMON_HPP
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 34095066433c0..597d929917e14 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -363,7 +363,7 @@ static void silu_sycl(const T *x, T *dst, const int k,
 template <typename T>
 static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
     // hard code for now
-    const int num_blocks = (k + 256 - 1) / 256;
+    const int num_blocks = ceil_div(k, 256);
     stream->parallel_for(
         sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
             sgn(x, dst, k, item_ct1);
@@ -373,7 +373,7 @@ static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
 template <typename T>
 static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
     // hard code for now
-    const int num_blocks = (k + 256 - 1) / 256;
+    const int num_blocks = ceil_div(k, 256);
     stream->parallel_for(
         sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
             abs_op(x, dst, k, item_ct1);
@@ -384,7 +384,7 @@ static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
 template <typename T>
 static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
     // hard code for now
-    const int num_blocks = (k + 256 - 1) / 256;
+    const int num_blocks = ceil_div(k, 256);
     stream->parallel_for(
         sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
             elu_op(x, dst, k, item_ct1);

From bafafc383c7b7dd11d1b8c9376e41096f9271dd5 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Sat, 26 Apr 2025 11:14:34 +0530
Subject: [PATCH 4/4] clean auto imported header files

---
 ggml/src/ggml-sycl/element_wise.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 597d929917e14..dcc6ec809a7d1 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -1,10 +1,6 @@
 #include "common.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
-#include
-#include
-#include
-#include
 
 static void acc_f32(const float * x, const float * y, float * dst, const int ne,
                     const int ne10, const int ne11, const int ne12,
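
Note on the launch pattern (not part of the patch series above): taken together, patches 2 and 3 move these kernels to a work-group count computed with ceil_div plus a grid-stride loop in the kernel body, so correctness no longer depends on launching exactly one work-item per element. The standalone sketch below illustrates that pattern only; the file name, the sgn_kernel/main scaffolding, and the use of USM shared allocations are assumptions made for a self-contained example and do not appear in the patches.

// sgn_stride_demo.cpp (hypothetical file name) -- illustrative sketch only.
// Shows the pattern: ceil_div for the block count, a fixed work-group size
// of 256, and a grid-stride loop inside the kernel.
#include <sycl/sycl.hpp>
#include <cstdio>

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}

template <typename T>
static void sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<3> & it) {
    // Grid-stride loop: each work-item steps by the total number of work-items,
    // so the kernel stays correct even if fewer or more than k work-items run.
    for (auto i = it.get_global_id(2); i < (size_t) k; i += it.get_global_range(2)) {
        dst[i] = x[i] > T(0.f) ? T(1.f) : (x[i] < T(0.f) ? T(-1.f) : T(0.f));
    }
}

int main() {
    sycl::queue q;
    const int k = 1000;                              // deliberately not a multiple of 256
    float * x   = sycl::malloc_shared<float>(k, q);
    float * dst = sycl::malloc_shared<float>(k, q);
    for (int i = 0; i < k; ++i) {
        x[i] = float(i - k / 2);
    }

    const size_t num_blocks = ceil_div(k, 256);      // round up so every element is covered
    q.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks * 256), sycl::range<3>(1, 1, 256)),
        [=](sycl::nd_item<3> it) { sgn_kernel(x, dst, k, it); }).wait();

    std::printf("sgn(%.1f) = %.1f\n", x[0], dst[0]); // expect -1.0
    sycl::free(x, q);
    sycl::free(dst, q);
    return 0;
}

With k = 1000 and a work-group size of 256, ceil_div yields 4 groups (1024 work-items); the extra 24 work-items simply fail the loop bound on their first check and do no work.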