Skip to content

sycl : Implemented reorder Q4_K mmvq #13109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions ggml/src/ggml-sycl/backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,24 @@
#ifndef GGML_SYCL_BACKEND_HPP
#define GGML_SYCL_BACKEND_HPP

#include "concat.hpp"
#include "common.hpp"
#include "concat.hpp"
#include "conv.hpp"
#include "convert.hpp"
#include "cpy.hpp"
#include "dequantize.hpp"
#include "dmmv.hpp"
#include "element_wise.hpp"
#include "gla.hpp"
#include "im2col.hpp"
#include "mmq.hpp"
#include "mmvq.hpp"
#include "rope.hpp"
#include "norm.hpp"
#include "outprod.hpp"
#include "quants.hpp"
#include "rope.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "im2col.hpp"
#include "wkv.hpp"
#include "outprod.hpp"
#include "element_wise.hpp"
#include "cpy.hpp"
#include "gla.hpp"

#endif // GGML_SYCL_BACKEND_HPP
#endif // GGML_SYCL_BACKEND_HPP
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,4 +776,9 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
}

bool gpu_has_xmx(sycl::device &dev);

constexpr size_t safe_div(const size_t m, const size_t n) {
return (m + n - 1) / n;
}

#endif // GGML_SYCL_COMMON_HPP
23 changes: 22 additions & 1 deletion ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,23 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
}
}

template <typename dst_t>
static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

stream->submit([&](sycl::handler & cgh) {
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
[=](sycl::nd_item<3> item_ct1) {
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
});
});
}
}

template <typename dst_t>
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
Expand Down Expand Up @@ -493,7 +510,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
case GGML_TYPE_Q4_K:
return dequantize_row_q4_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q4_K_sycl_reorder;
} else {
return dequantize_row_q4_K_sycl;
}
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_sycl;
case GGML_TYPE_Q6_K:
Expand Down
81 changes: 60 additions & 21 deletions ggml/src/ggml-sycl/dequantize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,31 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
}
#endif

template <typename dst_t>
inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
const float dmin, uint8_t * __restrict__ scales_local,
const sycl::nd_item<3> & item_ct1, int il, int ir) {
const int is = 2 * il;
const int n = 4;

item_ct1.barrier(sycl::access::fence_space::local_space);

uint8_t sc, m;
get_scale_min_k4(is + 0, scales_local, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;

get_scale_min_k4(is + 1, scales_local, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;

sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
}
}

template<typename dst_t>
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
Expand All @@ -365,36 +390,21 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
const int64_t i = item_ct1.get_group(2);

#if QK_K == 256
// assume 32 threads
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t is = 2*il;
const int64_t n = 4;
const int64_t il = tid / 8;
const int64_t ir = tid % 8;

dst_t * y = yy + i*QK_K + 64*il + n*ir;
dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

const sycl::half2 dm = x[i].dm;
const float dall = dm[0];
const float dmin = dm[1];

if (tid < 12)
if (tid < 12) {
scales_local[tid] = x[i].scales[tid];
item_ct1.barrier(sycl::access::fence_space::local_space);

uint8_t sc, m;
get_scale_min_k4(is + 0, scales_local, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, scales_local, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;

sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
y[l +32] = d2 * (q_vec[l] >> 4) - m2;
}

dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, item_ct1, il, ir);
#else
const int64_t tid = item_ct1.get_local_id(2);
const uint8_t * q = x[i].qs;
Expand All @@ -406,6 +416,35 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
#endif
}

template <typename dst_t>
static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
const sycl::nd_item<3> & item_ct1, int64_t nb) {
const int64_t i = item_ct1.get_group(2);
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid / 8;
const int64_t ir = tid % 8;

dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

const uint8_t * base = static_cast<const uint8_t *>(vx);
const size_t qs_offset = i * (QK_K / 2);
const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);

const uint8_t * qs_ptr = base + qs_offset;
const uint8_t * scales_ptr = base + scales_offset;
const ggml_half2 * dm_ptr = reinterpret_cast<const ggml_half2 *>(base + dm_offset);

const float dall = dm_ptr->x();
const float dmin = dm_ptr->y();

if (tid < 12) {
scales_local[tid] = scales_ptr[tid];
}

dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, item_ct1, il, ir);
}

template<typename dst_t>
static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
Expand Down
8 changes: 7 additions & 1 deletion ggml/src/ggml-sycl/dmmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_K:
dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
// reorder is currently not supported for dmmv
GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
} else {
dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q5_K:
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
Expand Down
Loading