From 9764a57d78e6531101e9a0c43bda0242e4962eff Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 12:08:52 -0700 Subject: [PATCH 01/65] Optimize memory transactions in SYCL backend parallel for Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 54 +++++++++++++++++-- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 96d63e33aee..652db0c65f5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -237,13 +237,57 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store + // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store + // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller + // but coalesced transactions will be made and performance should still be maximized. + // Grab the value type of the first range to estimate the optimal iters per work item. + using _ValueType = oneapi::dpl::__internal::__value_t>>>; + constexpr std::uint16_t __max_bytes_per_transaction = 512; + constexpr std::uint16_t __predicted_sub_group_size = 32; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
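+            // e.g. a 4-byte _ValueType (float) gives 16 / 4 = 4 iterations per work item, while a
+            // 32-byte _ValueType gives std::max(1, 16 / 32) = 1 iteration per work item.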
+ constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + // TODO: optimize for small data sizes that do not saturate the device with this scheme + __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } }); }); return __future(__event); From c836b1d20486f7b3b6ca783937a9848884b54cde Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 14:12:08 -0500 Subject: [PATCH 02/65] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 652db0c65f5..1b077268117 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -247,48 +247,54 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller // but coalesced transactions will be made and performance should still be maximized. // Grab the value type of the first range to estimate the optimal iters per work item. - using _ValueType = oneapi::dpl::__internal::__value_t>>>; + using _ValueType = + oneapi::dpl::__internal::__value_t>>>; constexpr std::uint16_t __max_bytes_per_transaction = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
- constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + constexpr std::uint16_t __iters_per_work_item = + std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = + std::max(static_cast<_Index>(__work_group_size), + oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); // TODO: optimize for small data sizes that do not saturate the device with this scheme - __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) - { - __brick(__idx, __rngs...); - __idx += __sub_group_size; - } - } - else - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) { - if (__idx < __count) + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) { __brick(__idx, __rngs...); __idx += __sub_group_size; } } - } - }); + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } + }); }); return __future(__event); } From 55f33a4c274542f814ec3c715430cd2c6ca9ac69 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 6 Sep 2024 09:42:06 -0700 Subject: [PATCH 03/65] Correct comment and error handling. 128 byte memory operations are performed instead of 512 after inspecting the assembly. Processing 512 bytes per sub-group still seems to be the best value after experimentation. 
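
For reference, the arithmetic this heuristic encodes, as a standalone
sketch (the names below are illustrative, not the oneDPL identifiers):

    #include <algorithm>
    #include <cstddef>

    // 512 bytes per sub-group / 32 work-items per sub-group = 16 bytes per
    // work-item; larger element types clamp to one element per work-item.
    constexpr std::size_t bytes_per_sub_group = 512;
    constexpr std::size_t sub_group_size = 32;
    constexpr std::size_t bytes_per_work_item = bytes_per_sub_group / sub_group_size;

    template <typename _T>
    constexpr std::size_t
    iters_per_work_item()
    {
        return std::max(std::size_t{1}, bytes_per_work_item / sizeof(_T));
    }

    static_assert(iters_per_work_item<float>() == 4);
    static_assert(iters_per_work_item<double>() == 2);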
Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1b077268117..70405bedce6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -242,22 +242,21 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store - // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store - // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller - // but coalesced transactions will be made and performance should still be maximized. + // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_transaction = 512; + + constexpr std::uint16_t __max_bytes_per_sub_group = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = - std::max(static_cast<_Index>(__work_group_size), - oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + std::size_t __num_groups = + std::max(__work_group_size, + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), From adadd56dfe608b537682f74f0131784e657a5d88 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 10 Sep 2024 08:29:09 -0700 Subject: [PATCH 04/65] __num_groups bugfix Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 70405bedce6..e54b2c5f300 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -254,7 +254,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); std::size_t __num_groups = - std::max(__work_group_size, + std::max(std::size_t{1}, oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme From 71d7bccc1f5f39406d11541dfc7de397c6abc9e2 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 08:58:22 -0700 Subject: [PATCH 05/65] Introduce stride recommender for different targets and better distribute work for small inputs Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 48 ++++++++----------- .../dpcpp/parallel_backend_sycl_utils.h | 34 +++++++++++++ 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e54b2c5f300..88a62ffdc65 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -240,56 +240,48 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // Limit the work-group size to 512 which has empirically yielded the best results. + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + __work_group_size = std::min(__work_group_size, static_cast(__count)); // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. 
using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_sub_group = 512; - constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; - // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. - constexpr std::uint16_t __iters_per_work_item = - std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + constexpr std::size_t __bytes_per_work_item = 16; + constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // For small data sizes, distribute the work evenly among compute units. + std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); std::size_t __num_groups = - std::max(std::size_t{1}, - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); std::size_t __num_items = __num_groups * __work_group_size; - // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) + auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + // TODO: Investigate using a vectorized approach similar to reduce. + // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but + // performance regressions for out-of-place (e.g. std::copy). 
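+                    // A "full" chunk owns exactly __iters_per_work_item strided elements, so the
+                    // loop below can run without bounds checks; the non-full path re-checks
+                    // __idx < __count on every iteration.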
+ if (__is_full) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } else { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..a9c195368b6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,6 +834,40 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different +// devices. This utility can only be called from the device. +// +// SPIR-V compilation targets show best performance with a stride of the sub-group size. +// Other compilation targets perform best with a work-group size stride. +template +std::tuple +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +{ + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From ebb3d569ca6d91d31ab7705790751d70350b15bc Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 14:27:52 -0700 Subject: [PATCH 06/65] Cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 +++++++++---------- .../dpcpp/parallel_backend_sycl_utils.h | 22 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 88a62ffdc65..17eda57cd09 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -245,20 +245,20 @@ struct 
__parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); __work_group_size = std::min(__work_group_size, static_cast(__count)); - // Processing 512 bytes per sub-group has shown the best performance on target architectures. - // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::size_t __bytes_per_work_item = 16; - constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which + // has yielded best performance on target architectures. For larger data types, load a single element. + constexpr std::uint8_t __bytes_per_work_item = 16; + constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); // For small data sizes, distribute the work evenly among compute units. - std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); - std::size_t __num_groups = + const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); - std::size_t __num_items = __num_groups * __work_group_size; + const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { @@ -268,7 +268,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). 
if (__is_full) { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; @@ -276,7 +276,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> } else { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index a9c195368b6..1c18eeeda8f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -845,24 +845,24 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - bool __is_full_sub_group = + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - bool __is_full_work_group = + const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } From 2c4ecd0ddec719b9ec1e6fc8d97fe74499e56bfc Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 18 Sep 2024 12:52:47 -0700 Subject: [PATCH 07/65] Unroll loop if possible Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 17eda57cd09..c4e23190bf6 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,6 +256,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
         const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size);
         // For small data sizes, distribute the work evenly among compute units.
         const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast<std::size_t>(__max_iters_per_work_item));
+        const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item;
         const std::size_t __num_groups =
             oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
         const std::size_t __num_items = __num_groups * __work_group_size;
@@ -266,7 +267,16 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             // TODO: Investigate using a vectorized approach similar to reduce.
             // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
             // performance regressions for out-of-place (e.g. std::copy).
-            if (__is_full)
+            if (__is_full && __can_unroll_loop)
+            {
+                _ONEDPL_PRAGMA_UNROLL
+                for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i)
+                {
+                    __brick(__idx, __rngs...);
+                    __idx += __stride;
+                }
+            }
+            else if (__is_full)
             {
                 for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
                 {

From dc6bd0c6b94e81aeb9601ee0078c9d2a8c35402c Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Wed, 18 Sep 2024 13:15:44 -0700
Subject: [PATCH 08/65] Revert "Unroll loop if possible"

This reverts commit e4cbcebf6ec43c2eced1a90124e6306883793da0. Small
sizes are slightly slower, and no real benefit is observed with
horizontal vectorization.
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c4e23190bf6..17eda57cd09 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,7 +256,6 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
         const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size);
         // For small data sizes, distribute the work evenly among compute units.
         const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast<std::size_t>(__max_iters_per_work_item));
-        const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item;
         const std::size_t __num_groups =
             oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
         const std::size_t __num_items = __num_groups * __work_group_size;
@@ -267,16 +266,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             // TODO: Investigate using a vectorized approach similar to reduce.
             // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
             // performance regressions for out-of-place (e.g. std::copy).
-            if (__is_full && __can_unroll_loop)
-            {
-                _ONEDPL_PRAGMA_UNROLL
-                for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i)
-                {
-                    __brick(__idx, __rngs...);
-                    __idx += __stride;
-                }
-            }
-            else if (__is_full)
+            if (__is_full)
             {
                 for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
                 {

From d5126b2df69fbc482375c9fd01ff5edc5e3c60c8 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Thu, 19 Sep 2024 20:02:21 -0700
Subject: [PATCH 09/65] Use a small and large kernel in parallel for

Small but measurable overheads can be observed for small inputs where
runtime dispatch in the kernel is present to check for the correct path
to take. Letting the compiler handle the small input case in the
original kernel shows the best performance.

Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 70 +++++++++++++++----
 1 file changed, 57 insertions(+), 13 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 17eda57cd09..2e4869fdb6b 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -218,6 +218,12 @@ class __scan_single_wg_dynamic_kernel;
 template <typename... _Name>
 class __scan_copy_single_wg_kernel;
 
+template <typename... _Name>
+class __parallel_for_small_kernel;
+
+template <typename... _Name>
+class __parallel_for_large_kernel;
+
 //------------------------------------------------------------------------
 // parallel_for - async pattern
 //------------------------------------------------------------------------
@@ -226,10 +232,35 @@ class __scan_copy_single_wg_kernel;
 // as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
 // type (for explicitly specified name by the user)
 template <typename _KernelName>
-struct __parallel_for_submitter;
+struct __parallel_for_small_submitter;
 
 template <typename... _Name>
-struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
+struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
+{
+    template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
+    auto
+    operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
+    {
+        assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
+        _PRINT_INFO_IN_DEBUG_MODE(__exec);
+        auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
+            //get an access to data under SYCL buffer:
+            oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
+
+            __cgh.parallel_for<_Name...>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
+                auto __idx = __item_id.get_linear_id();
+                __brick(__idx, __rngs...);
+            });
+        });
+        return __future(__event);
+    }
+};
+
+template <typename _KernelName>
+struct __parallel_for_large_submitter;
+
+template <typename... _Name>
+struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>>
 {
     template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
     auto
@@ -248,14 +279,13 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             using _ValueType =
                 oneapi::dpl::__internal::__value_t<std::decay_t<std::tuple_element_t<0, std::tuple<_Ranges...>>>>;
 
-            // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which
-            // has yielded best performance on target architectures. For larger data types, load a single element.
+            // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum
+            // number of bytes for a single input range to saturate HW bandwidth on target architectures.
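+            // (With a size-32 sub-group, 16 bytes per work-item again amounts to 512 bytes per sub-group per range.)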
constexpr std::uint8_t __bytes_per_work_item = 16; - constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); - // For small data sizes, distribute the work evenly among compute units. - const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + // TODO: Better handle this heuristic for the case where the input is a zip iterator + constexpr std::uint8_t __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -268,6 +298,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). if (__is_full) { + _ONEDPL_PRAGMA_UNROLL for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); @@ -299,10 +330,23 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single + // kernel that worsen performance for small cases. + if (__count <= 262144) + { + return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } //------------------------------------------------------------------------ From 6433a5004234e56ae8132274537d3a02c1c5d6aa Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 10:46:24 -0500 Subject: [PATCH 10/65] Improve __iters_per_work_item heuristic. We now flatten the user-provided ranges and find the minimum sized type to estimate the best __iters_per_work_item. This benefits performance in calls that wrap multiple buffers in a single input / output through a zip_iterator (e.g. dpct::scatter_if in SYCLomatic compatibility headers). 
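
As a sketch of the type computation (the element types here are
hypothetical; the traits are the ones added in this patch):

    // A zip of a float range and a std::uint8_t range flattens to
    // std::tuple<float, std::uint8_t>; the 1-byte minimum type then yields
    // __iters_per_work_item = 16 / 1 = 16.
    using _Flattened = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
        std::tuple<float, std::tuple<std::uint8_t>>>::type;
    using _Min = typename oneapi::dpl::__internal::__min_tuple_type<_Flattened>::type;
    static_assert(std::is_same_v<_Min, std::uint8_t>);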
Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 59 ++++++++++++-------
 include/oneapi/dpl/pstl/tuple_impl.h          | 20 +++++++
 include/oneapi/dpl/pstl/utils.h               | 20 +++++++
 3 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 2e4869fdb6b..65d358cb045 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,13 +256,35 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name..
     }
 };
 
-template <typename _KernelName>
+template <typename _KernelName, typename... _Ranges>
 struct __parallel_for_large_submitter;
 
-template <typename... _Name>
-struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>>
-{
-    template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
+template <typename... _Name, typename... _Ranges>
+struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...>
+{
+    static constexpr std::uint8_t __bytes_per_work_item = 16;
+    // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip
+    // iterator.
+    using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
+        std::tuple<oneapi::dpl::__internal::__value_t<_Ranges>...>>::type;
+    using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type;
+    // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
+    // flattened ranges. This allows us to launch enough work per item to saturate device memory.
+    static constexpr std::uint8_t __iters_per_work_item =
+        oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType));
+
+    // Once there is enough work to launch a group on each compute unit with our __iters_per_item,
+    // then we should start using this code path.
+    template <typename _ExecutionPolicy>
+    static std::size_t
+    __estimate_best_start_size(const _ExecutionPolicy& __exec)
+    {
+        std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512);
+        const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
+        return __work_group_size * __iters_per_work_item * __max_cu;
+    }
+
+    template <typename _ExecutionPolicy, typename _Fp, typename _Index>
     auto
     operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
     {
@@ -274,25 +296,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
             //get an access to data under SYCL buffer:
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
 
             // Limit the work-group size to 512 which has empirically yielded the best results.
             std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512);
-            __work_group_size = std::min(__work_group_size, static_cast<std::size_t>(__count));
-
-            using _ValueType =
-                oneapi::dpl::__internal::__value_t<std::decay_t<std::tuple_element_t<0, std::tuple<_Ranges...>>>>;
-
-            // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum
-            // number of bytes for a single input range to saturate HW bandwidth on target architectures.
- constexpr std::uint8_t __bytes_per_work_item = 16; // TODO: Better handle this heuristic for the case where the input is a zip iterator - constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::size_t __num_groups = - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + auto [__idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate using a vectorized approach similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy). @@ -335,17 +348,19 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using _ForKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single // kernel that worsen performance for small cases. - if (__count <= 262144) + if (__count < __large_submitter::__estimate_best_start_size(__exec)) { - return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } else { - return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } } diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 239734d4861..0c528b0d15e 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,6 +793,26 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; + +// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. 
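+// For example, std::tuple<int, oneapi::dpl::__internal::tuple<char, double>> flattens to
+// std::tuple<int, char, double>.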
+template +struct __flatten_std_or_internal_tuple +{ + using type = std::tuple<_T>; +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 8a8dfdae1bc..1168dc76586 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,6 +784,26 @@ union __lazy_ctor_storage } }; +// Utility that returns the smallest type in tuple. +template +class __min_tuple_type; + +template +class __min_tuple_type> +{ + public: + using type = _T; +}; + +template +class __min_tuple_type> +{ + using __min_type_ts = typename __min_tuple_type>::type; + + public: + using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; +}; + } // namespace __internal } // namespace dpl } // namespace oneapi From d376124b827f2232bf0600b9efe8b5236d074ff8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 16:00:45 -0500 Subject: [PATCH 11/65] Code cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 ++++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 16 +++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 65d358cb045..04c1c3a07c6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -262,24 +262,29 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> { - static constexpr std::uint8_t __bytes_per_work_item = 16; // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the - // flattened ranges. This allows us to launch enough work per item to saturate device memory. + // flattened ranges. This allows us to launch enough work per item to saturate the device's memory + // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to + // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the + // best general performance. + static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; - // Once there is enough work to launch a group on each compute unit with our __iters_per_item, + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. 
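+    // i.e. the large submitter is preferred once __count reaches
+    // __work_group_size * __iters_per_work_item * __max_compute_units.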
template static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -294,10 +299,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - // Limit the work-group size to 512 which has empirically yielded the best results. - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); - - // TODO: Better handle this heuristic for the case where the input is a zip iterator + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -368,7 +370,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2284,7 +2286,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 1c18eeeda8f..0b382b5e248 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -841,7 +841,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // Other compilation targets perform best with a work-group size stride. 
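+// For example, with __iters_per_work_item == 4 a work-item processes __idx, __idx + __stride,
+// __idx + 2 * __stride, and __idx + 3 * __stride, so neighboring work-items always touch
+// adjacent elements within each strided pass.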
template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { @@ -851,19 +852,18 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_group_start_idx = + __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - const bool __is_full_work_group = - __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From a7c7606af0985fff5caf66862fcaf23b4df57905 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 13:29:09 -0500 Subject: [PATCH 12/65] Clang format Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 ++- include/oneapi/dpl/pstl/tuple_impl.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 04c1c3a07c6..cc79df785b9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -299,7 +299,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
//get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 0c528b0d15e..c758a4a3f1b 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,12 +793,11 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; - // Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. template struct __flatten_std_or_internal_tuple { - using type = std::tuple<_T>; + using type = std::tuple<_T>; }; template From b8aa15cadbcf8a02986ab4a18ba39baa69865f4e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:02:21 -0500 Subject: [PATCH 13/65] Update comments Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index cc79df785b9..4fe18421543 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -270,8 +270,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the - // best general performance. + // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); @@ -309,9 +309,10 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. [=](sycl::nd_item __ndi) { auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); - // TODO: Investigate using a vectorized approach similar to reduce. + // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but - // performance regressions for out-of-place (e.g. std::copy). + // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to + // vectorize our code. 
if (__is_full) { _ONEDPL_PRAGMA_UNROLL From b45a7c2f6dfe2ad8b59bf8cb1589c0cfcc6e2c9f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:05:09 -0500 Subject: [PATCH 14/65] Bugfix in comment Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4fe18421543..6613606800a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -270,7 +270,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = From 4f9a3606ea95af41b8a8b5fa443fa3111a91e854 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 15:10:06 -0500 Subject: [PATCH 15/65] More cleanup and better handle non-full case Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 31 ++++++++++++------- .../dpcpp/parallel_backend_sycl_utils.h | 6 ++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6613606800a..ab96e782d7a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -284,7 +284,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -298,8 +299,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - std::size_t __work_group_size = + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); @@ -307,12 +307,13 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
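+            // __num_items is rounded up to a whole number of work-groups; the kernel's non-full
+            // path handles the final partial chunk.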
__cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to - // vectorize our code. + // vectorize our code. Vectorization may also improve performance of for-algorithms over small data + // types. + auto [__idx, __group_start_idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL @@ -324,13 +325,19 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. } else { - for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) + // Recompute iters per item and manually unroll last loop iteration to remove most branching. + if (__group_start_idx >= __count) + return; + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) { - if (__idx < __count) - { - __brick(__idx, __rngs...); - __idx += __stride; - } + __brick(__idx, __rngs...); + __idx += __stride; + } + if (__idx < __count) + { + __brick(__idx, __rngs...); } } }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 0b382b5e248..39f5052382a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
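Concretely, on a SPIR-V target with a sub-group size of 32 and __iters_per_work_item = 4, the work item with sub-group local id j in a sub-group whose block starts at index S touches S + j, S + j + 32, S + j + 64, and S + j + 96. On each pass the 32 items of a sub-group therefore access 32 adjacent elements, which is what keeps each load / store transaction coalesced; the other branch applies the same pattern at work-group granularity.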
template -std::tuple +std::tuple __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } } From 7bb1d2b81d11161cbce2009318559fba18fa0a63 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:38:40 -0500 Subject: [PATCH 16/65] Rename __ndi to __item for consistency with codebase Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ab96e782d7a..6c0cf593f3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -306,14 +306,14 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), - [=](sycl::nd_item __ndi) { + [=](sycl::nd_item __item) { // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. 
auto [__idx, __group_start_idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 39f5052382a..fc98cc86db2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,18 +839,18 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template +template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); @@ -861,8 +861,8 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite else { const std::size_t __work_group_start_idx = - __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } From a2ad92041e603f69cd075c6d8f08142038fbc2e4 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:45:52 -0500 Subject: [PATCH 17/65] Update all comments on kernel naming trick Signed-off-by: Matthew Michel --- .../experimental/kt/internal/esimd_radix_sort_submitters.h | 2 +- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ 
b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template struct __parallel_scan_submitter; @@ -2295,7 +2295,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..3be82fdc623 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cadff26a15d..9c331148f3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -112,7 +112,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index edad63d2a79..23e38268bf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
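Stepping back from the renames above, the index arithmetic of __stride_recommender can be modeled on the host. This is a sketch with assumed standalone signatures, reduced to the essential index, stride, and fullness flag; the real utility is a device-side function driven by a sycl::nd_item:

    #include <cstddef>
    #include <tuple>

    // Non-SPIR-V branch: each work group owns a contiguous block of
    // work_group_size * iters elements and strides through it by work_group_size.
    std::tuple<std::size_t, std::size_t, bool>
    work_group_stride(std::size_t group_id, std::size_t local_id,
                      std::size_t work_group_size, std::size_t iters, std::size_t count)
    {
        const std::size_t group_start = group_id * work_group_size * iters;
        const bool is_full = group_start + iters * work_group_size <= count;
        return std::make_tuple(group_start + local_id, work_group_size, is_full);
    }

    // SPIR-V branch: the owned block and the stride shrink to sub-group granularity.
    std::tuple<std::size_t, std::size_t, bool>
    sub_group_stride(std::size_t group_id, std::size_t sub_group_id,
                     std::size_t sub_group_local_id, std::size_t sub_group_size,
                     std::size_t work_group_size, std::size_t iters, std::size_t count)
    {
        const std::size_t sub_group_start =
            iters * (group_id * work_group_size + sub_group_size * sub_group_id);
        const bool is_full = sub_group_start + iters * sub_group_size <= count;
        return std::make_tuple(sub_group_start + sub_group_local_id, sub_group_size, is_full);
    }

In both branches the returned flag is true only when the full block fits below count, which is what lets the kernel skip bounds checks on the hot path.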
From 47fe214c25cde24305bd118a689dbdba1a385e4b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 09:29:54 -0500 Subject: [PATCH 18/65] Handle non-full case in a cleaner way Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 16 +++++----------- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 6 +++--- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ba4a4537da2..282b787448a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -312,7 +312,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. - auto [__idx, __group_start_idx, __stride, __is_full] = + auto [__idx, __stride, __is_full] = __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { @@ -323,22 +323,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. __idx += __stride; } } - else + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) { - // Recompute iters per item and manually unroll last loop iteration to remove most branching. - if (__group_start_idx >= __count) - return; const std::uint8_t __adjusted_iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); - for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; } - if (__idx < __count) - { - __brick(__idx, __rngs...); - } } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index fc98cc86db2..c447f9b592f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
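As a worked instance of the remainder branch introduced just above: with assumed values __count = 1000 and __stride = 32, a work item entering at __idx = 940 computes __adjusted_iters_per_work_item = ceil((1000 - 940) / 32) = 2, processes indices 940 and 972, and stops before reaching 1004, with no per-iteration bounds check left inside the loop.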
template -std::tuple +std::tuple __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From 79a18e9545c84c1727f283f8fd358dbacc7f9da6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:17:14 -0500 Subject: [PATCH 19/65] Switch min tuple type utility to return size of type Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++-- include/oneapi/dpl/pstl/utils.h | 17 ++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 282b787448a..dd963a5e591 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -266,7 +266,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; - using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; + static constexpr std::size_t __min_type_size = + oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to @@ -274,7 +275,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size); // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 1168dc76586..10d60d8c5d6 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,26 +784,29 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type in tuple. +// Utility that returns the smallest type size in a tuple. template -class __min_tuple_type; +class __min_tuple_type_size; template -class __min_tuple_type> +class __min_tuple_type_size> { public: - using type = _T; + static constexpr std::size_t value = sizeof(_T); }; template -class __min_tuple_type> +class __min_tuple_type_size> { - using __min_type_ts = typename __min_tuple_type>::type; + static constexpr std::size_t __min_type_value_ts = __min_tuple_type_size>::value; public: - using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; + static constexpr std::size_t value = std::min(sizeof(_T), __min_type_value_ts); }; +template +inline constexpr std::size_t __min_tuple_type_size_v = __min_tuple_type_size<_Tuple>::value; + } // namespace __internal } // namespace dpl } // namespace oneapi From 3ab8c75aeab4a05becd8da49879ff0003f6701e3 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:59:53 -0500 Subject: [PATCH 20/65] Remove unnecessary template parameter Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index c447f9b592f..77cc8ad1671 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,9 +839,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template std::tuple -__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) From 4a70fe2155c3bc100bca7e47c4a3047a01f32291 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 13:07:30 -0500 Subject: [PATCH 21/65] Make non-template function inline for ODR compliance Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 77cc8ad1671..685b4760daa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,7 +839,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
-std::tuple +inline std::tuple __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { From 5530209445e7a824b70a2d8201edcbaf6a8cce44 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 14:21:40 -0700 Subject: [PATCH 22/65] If the iters per work item is 1, then only compile the basic pfor kernel Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dd963a5e591..aee0c73fdf6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -356,16 +356,25 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; - // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single - // kernel that worsen performance for small cases. - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. + if constexpr (__large_submitter::__iters_per_work_item > 1) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + if (__count < __large_submitter::__estimate_best_start_size(__exec)) + { + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } else { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } } From 90f19d4c62facaa3211e8af12826ef3ea036b861 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:11:44 -0500 Subject: [PATCH 23/65] Address several PR comments * Move __stride_recommender into __parallel_for_large_submitter * Use {} to invoke constructor * Simplify if-else statements in for dispatch Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index aee0c73fdf6..30f983c8a72 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -279,6 +279,39 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. + static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) + { + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } + } + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. template @@ -361,22 +394,14 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // then only compile the basic kernel as the two versions are effectively the same. 
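The dispatch being simplified in this patch can be modeled in isolation. A sketch with invented stand-ins (the real submitters are templates taking policies and ranges, and the threshold comes from __estimate_best_start_size):

    #include <cstddef>
    #include <cstdint>

    struct small_submitter
    {
        void operator()(std::size_t /*count*/) const { /* basic one-element-per-item kernel */ }
    };

    struct large_submitter
    {
        static constexpr std::uint8_t iters_per_work_item = 4;             // assumed heuristic result
        static std::size_t estimate_best_start_size() { return 1u << 20; } // assumed threshold
        void operator()(std::size_t /*count*/) const { /* strided multi-element kernel */ }
    };

    void parallel_for_dispatch(std::size_t count)
    {
        // When iters_per_work_item == 1 the large path degenerates into the small one;
        // in the templated submitter the `if constexpr` prunes that branch so only one
        // kernel is instantiated.
        if constexpr (large_submitter::iters_per_work_item > 1)
        {
            if (count >= large_submitter::estimate_best_start_size())
            {
                large_submitter{}(count);
                return;
            }
        }
        small_submitter{}(count);
    }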
if constexpr (__large_submitter::__iters_per_work_item > 1) { - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + if (__count >= __large_submitter::__estimate_best_start_size(__exec)) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } - else - { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); - } - } - else - { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } //------------------------------------------------------------------------ From 1ac65b927db4944fca994c30240157e7fb201434 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:19:31 -0500 Subject: [PATCH 24/65] Remove free function __stride_recommender Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_utils.h | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 685b4760daa..f4eb557170e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,39 +834,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; -// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different -// devices. This utility can only be called from the device. -// -// SPIR-V compilation targets show best performance with a stride of the sub-group size. -// Other compilation targets perform best with a work-group size stride. 
-inline std::tuple -__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, - std::size_t __work_group_size) -{ - if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) - { - const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); - const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); - - const std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); - } - else - { - const std::size_t __work_group_start_idx = - __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); - const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); - } -} - } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 6a5a562aaa3140b55e7b93752658104c487ee5ef Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 13:17:30 -0500 Subject: [PATCH 25/65] Accept ranges as forwarding references in __parallel_for_large_submitter Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 30f983c8a72..8798b50b126 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -256,16 +256,16 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name.. } }; -template +template struct __parallel_for_large_submitter; -template -struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; + std::tuple...>>::type; static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the @@ -324,7 +324,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. return __work_group_size * __iters_per_work_item * __max_cu; } - template + template auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) const { From 357032f663e39a91d2fe3e8943f9a3d5465f4c87 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 6 Nov 2024 13:44:38 -0600 Subject: [PATCH 26/65] Address reviewer comments Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++---- include/oneapi/dpl/pstl/tuple_impl.h | 19 -------------- include/oneapi/dpl/pstl/utils.h | 25 +++++++------------ 3 files changed, 12 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8798b50b126..d029aea0de7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -262,12 +263,8 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { - // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip - // iterator. - using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; - static constexpr std::size_t __min_type_size = - oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; + static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size< + std::tuple...>>::value; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index c758a4a3f1b..239734d4861 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,25 +793,6 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; -// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. -template -struct __flatten_std_or_internal_tuple -{ - using type = std::tuple<_T>; -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 10d60d8c5d6..1848d33eaea 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,29 +785,21 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type size in a tuple. -template -class __min_tuple_type_size; - +// Returns the smallest type within a set of potentially nested template types. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). 
 template <typename _T>
-class __min_tuple_type_size<std::tuple<_T>>
+struct __min_nested_type_size
 {
-  public:
-    static constexpr std::size_t value = sizeof(_T);
+    constexpr static std::size_t value = sizeof(_T);
 };

-template <typename _T, typename... _Ts>
+template <template <typename...> typename _WrapperType, typename... _Ts>
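To check the recursion this final hunk is building toward, here is a standalone re-creation of the nested-minimum utility (assumed names, mirroring the behavior the new comment describes with T = std::tuple<std::tuple<short, long>, int, double>):

    #include <algorithm>
    #include <cstddef>
    #include <tuple>

    // Primary template: a non-template type contributes its own size.
    template <typename T>
    struct min_nested_type_size
    {
        static constexpr std::size_t value = sizeof(T);
    };

    // Any template wrapper (std::tuple, the internal tuple, etc.) contributes
    // the minimum over its nested type arguments, recursively.
    template <template <typename...> typename Wrapper, typename... Ts>
    struct min_nested_type_size<Wrapper<Ts...>>
    {
        static constexpr std::size_t value = std::min({min_nested_type_size<Ts>::value...});
    };

    // The smallest nested type is short, so the recursion bottoms out at sizeof(short).
    static_assert(min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value ==
                  sizeof(short));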