diff --git a/ggml-common.h b/ggml-common.h index b2d67d5db529c..43c7978a0982d 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -447,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_SYCL) + #include -#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory name(sycl::range<1>(size), { -#define GGML_TABLE_END() }); +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; #define GGML_COMMON_IMPL #endif diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6fd6ebd3a5a7a..2b0e5f54827a5 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -8079,7 +8079,7 @@ template static void template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) { + const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -9956,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq2xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), @@ -9985,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq2xs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), @@ -10014,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq3xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), @@ -10043,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq3s_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), @@ -10072,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq1s_grid_gpu.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), @@ -10415,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10428,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10444,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10457,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10473,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10486,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10502,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10515,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10531,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10544,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10560,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10573,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10589,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10602,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10618,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10631,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10647,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10660,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10676,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10689,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } } + static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -10705,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq2xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0]; + auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; + auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10736,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq2xs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10764,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10792,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3s_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10820,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq1s_grid_gpu.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims),