From e0e03bdc44c3028d24e9d40de72b97df10e3e048 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 12:08:52 -0700 Subject: [PATCH 01/26] Optimize memory transactions in SYCL backend parallel for Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 54 +++++++++++++++++-- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 77b2e6c35f9..4d4850ec6a2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -233,13 +233,57 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store + // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store + // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller + // but coalesced transactions will be made and performance should still be maximized. + // Grab the value type of the first range to estimate the optimal iters per work item. + using _ValueType = oneapi::dpl::__internal::__value_t>>>; + constexpr std::uint16_t __max_bytes_per_transaction = 512; + constexpr std::uint16_t __predicted_sub_group_size = 32; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
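+                // (With __bytes_per_work_item == 16, any _ValueType wider than 16 bytes makes the
+                // division below evaluate to zero, so the std::max clamps the result to a single iteration.)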
+ constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + // TODO: optimize for small data sizes that do not saturate the device with this scheme + __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } }); }); return __future(__event); From 9f3384a270721297c1f3a5fecc4da9f53c4daea6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 14:12:08 -0500 Subject: [PATCH 02/26] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4d4850ec6a2..dba98ac1d4d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -243,48 +243,54 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller // but coalesced transactions will be made and performance should still be maximized. // Grab the value type of the first range to estimate the optimal iters per work item. - using _ValueType = oneapi::dpl::__internal::__value_t>>>; + using _ValueType = + oneapi::dpl::__internal::__value_t>>>; constexpr std::uint16_t __max_bytes_per_transaction = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
- constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + constexpr std::uint16_t __iters_per_work_item = + std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = + std::max(static_cast<_Index>(__work_group_size), + oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); // TODO: optimize for small data sizes that do not saturate the device with this scheme - __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) - { - __brick(__idx, __rngs...); - __idx += __sub_group_size; - } - } - else - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) { - if (__idx < __count) + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) { __brick(__idx, __rngs...); __idx += __sub_group_size; } } - } - }); + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } + }); }); return __future(__event); } From a244bbb9ba4015ea3a473b0d84befa0b9c16fcee Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 6 Sep 2024 09:42:06 -0700 Subject: [PATCH 03/26] Correct comment and error handling. 128 byte memory operations are performed instead of 512 after inspecting the assembly. Processing 512 bytes per sub-group still seems to be the best value after experimentation. 
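For reference, the arithmetic behind the iteration count reduces to a small standalone sketch (illustrative names, not the identifiers used in the patch), assuming the 512-bytes-per-sub-group and 32-lane figures discussed above:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // 512 bytes per 32-wide sub-group gives 16 bytes per work-item; element types wider
    // than that clamp to a single iteration per work-item.
    template <typename T>
    constexpr std::size_t example_iters_per_work_item()
    {
        constexpr std::size_t bytes_per_sub_group = 512;
        constexpr std::size_t sub_group_size = 32;
        constexpr std::size_t bytes_per_work_item = bytes_per_sub_group / sub_group_size; // 16
        return std::max<std::size_t>(std::size_t{1}, bytes_per_work_item / sizeof(T));
    }

    // Assuming 1-byte std::uint8_t, 4-byte float, and 8-byte double.
    static_assert(example_iters_per_work_item<std::uint8_t>() == 16);
    static_assert(example_iters_per_work_item<float>() == 4);
    static_assert(example_iters_per_work_item<double>() == 2);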
Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dba98ac1d4d..707445e4a6c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -238,22 +238,21 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store - // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store - // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller - // but coalesced transactions will be made and performance should still be maximized. + // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_transaction = 512; + + constexpr std::uint16_t __max_bytes_per_sub_group = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = - std::max(static_cast<_Index>(__work_group_size), - oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + std::size_t __num_groups = + std::max(__work_group_size, + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), From ad05086a8d0db3c3582e7694e02e7fa86102bc37 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 10 Sep 2024 08:29:09 -0700 Subject: [PATCH 04/26] __num_groups bugfix Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 707445e4a6c..4958e712a5b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -250,7 +250,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); std::size_t __num_groups = - std::max(__work_group_size, + std::max(std::size_t{1}, oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme From bb83642f5ac496bfac3b231354f0023ce4537a5c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 08:58:22 -0700 Subject: [PATCH 05/26] Introduce stride recommender for different targets and better distribute work for small inputs Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 48 ++++++++----------- .../dpcpp/parallel_backend_sycl_utils.h | 34 +++++++++++++ 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4958e712a5b..0fba4e8f891 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -236,56 +236,48 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // Limit the work-group size to 512 which has empirically yielded the best results. + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + __work_group_size = std::min(__work_group_size, static_cast(__count)); // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. 
using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_sub_group = 512; - constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; - // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. - constexpr std::uint16_t __iters_per_work_item = - std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + constexpr std::size_t __bytes_per_work_item = 16; + constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // For small data sizes, distribute the work evenly among compute units. + std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); std::size_t __num_groups = - std::max(std::size_t{1}, - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); std::size_t __num_items = __num_groups * __work_group_size; - // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) + auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + // TODO: Investigate using a vectorized approach similar to reduce. + // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but + // performance regressions for out-of-place (e.g. std::copy). 
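+                    // The __is_full branch below lets full chunks skip per-iteration bounds checks: when the
+                    // strided window chosen by __stride_recommender is known to lie entirely within __count,
+                    // every access is unconditional, and only the trailing partial chunk keeps the
+                    // __idx < __count guard.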
+ if (__is_full) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } else { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index e7fbfb7ae7c..eae73b0505b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,6 +834,40 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different +// devices. This utility can only be called from the device. +// +// SPIR-V compilation targets show best performance with a stride of the sub-group size. +// Other compilation targets perform best with a work-group size stride. +template +std::tuple +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +{ + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 3dbdcec82415a80fbafa18c28f8896a2112dd89f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 14:27:52 -0700 Subject: [PATCH 06/26] Cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 +++++++++---------- .../dpcpp/parallel_backend_sycl_utils.h | 22 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0fba4e8f891..84d7c14c158 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -241,20 +241,20 @@ struct 
__parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); __work_group_size = std::min(__work_group_size, static_cast(__count)); - // Processing 512 bytes per sub-group has shown the best performance on target architectures. - // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::size_t __bytes_per_work_item = 16; - constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which + // has yielded best performance on target architectures. For larger data types, load a single element. + constexpr std::uint8_t __bytes_per_work_item = 16; + constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); // For small data sizes, distribute the work evenly among compute units. - std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); - std::size_t __num_groups = + const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); - std::size_t __num_items = __num_groups * __work_group_size; + const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { @@ -264,7 +264,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). 
if (__is_full) { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; @@ -272,7 +272,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> } else { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index eae73b0505b..14bb5122e04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -845,24 +845,24 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - bool __is_full_sub_group = + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - bool __is_full_work_group = + const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } From f53ae6c6be9c13676a4ac6dbf225a3a13205e989 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 18 Sep 2024 12:52:47 -0700 Subject: [PATCH 07/26] Unroll loop if possible Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 84d7c14c158..f94cd3f3da3 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -252,6 +252,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); // For small data sizes, distribute the work evenly among compute units. const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item; const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -262,7 +263,16 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // TODO: Investigate using a vectorized approach similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy). - if (__is_full) + if (__is_full && __can_unroll_loop) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i) + { + __brick(__idx, __rngs...); + __idx += __stride; + } + } + else if (__is_full) { for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { From 0bf77a7ef97ea38fa2a3f6a0b3364aef90830c3f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 18 Sep 2024 13:15:44 -0700 Subject: [PATCH 08/26] Revert "Unroll loop if possible" This reverts commit e4cbcebf6ec43c2eced1a90124e6306883793da0. Small sizes slightly slower and for horizontal vectorization no "real" benefit is observed. --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f94cd3f3da3..84d7c14c158 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -252,7 +252,6 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); // For small data sizes, distribute the work evenly among compute units. const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); - const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item; const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -263,16 +262,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // TODO: Investigate using a vectorized approach similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy). 
- if (__is_full && __can_unroll_loop) - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i) - { - __brick(__idx, __rngs...); - __idx += __stride; - } - } - else if (__is_full) + if (__is_full) { for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { From 7768aff5a03598f4ff92fe74bae2565495837bde Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 19 Sep 2024 20:02:21 -0700 Subject: [PATCH 09/26] Use a small and large kernel in parallel for Small but measurable overheads can be observed for small inputs where runtime dispatch in the kernel is present to check for the correct path to take. Letting the compiler handle the the small input case in the original kernel shows the best performance. Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 70 +++++++++++++++---- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 84d7c14c158..d5df7c06cf6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -214,6 +214,12 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + //------------------------------------------------------------------------ // parallel_for - async pattern //------------------------------------------------------------------------ @@ -222,10 +228,35 @@ class __scan_copy_single_wg_kernel; // as the parameter pack that can be empty (for unnamed kernels) or contain exactly one // type (for explicitly specified name by the user) template -struct __parallel_for_submitter; +struct __parallel_for_small_submitter; template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + + __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { + auto __idx = __item_id.get_linear_id(); + __brick(__idx, __rngs...); + }); + }); + return __future(__event); + } +}; + +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>> { template auto @@ -244,14 +275,13 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> using _ValueType = oneapi::dpl::__internal::__value_t>>>; - // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which - // has yielded best performance on target architectures. For larger data types, load a single element. + // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum + // number of bytes for a single input range to saturate HW bandwidth on target architecures. 
constexpr std::uint8_t __bytes_per_work_item = 16; - constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); - // For small data sizes, distribute the work evenly among compute units. - const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + // TODO: Better handle this heuristic for the case where the input is a zip iterator + constexpr std::uint8_t __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -264,6 +294,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). if (__is_full) { + _ONEDPL_PRAGMA_UNROLL for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); @@ -295,10 +326,23 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single + // kernel that worsen performance for small cases. + if (__count <= 262144) + { + return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } //------------------------------------------------------------------------ From 4316e07ba62c962588b253de4d0d774bc205d926 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 10:46:24 -0500 Subject: [PATCH 10/26] Improve __iters_per_work_item heuristic. We now flatten the user-provided ranges and find the minimum sized type to estimate the best __iters_per_work_item. This benefits performance in calls that wrap multiple buffers in a single input / output through a zip_iterator (e.g. dpct::scatter_if in SYCLomatic compatibility headers). 
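As a rough standalone illustration of the heuristic (simplified names, not the metafunctions added by this patch), the smallest value type across the flattened ranges is what drives the per-work-item iteration count:

    #include <cstdint>
    #include <tuple>
    #include <type_traits>

    // Pick the smallest-sized type out of a std::tuple of value types.
    template <typename Tuple>
    struct example_min_tuple_type;

    template <typename T>
    struct example_min_tuple_type<std::tuple<T>>
    {
        using type = T;
    };

    template <typename T, typename... Ts>
    struct example_min_tuple_type<std::tuple<T, Ts...>>
    {
        using rest = typename example_min_tuple_type<std::tuple<Ts...>>::type;
        using type = std::conditional_t<(sizeof(T) < sizeof(rest)), T, rest>;
    };

    // A zip of a double buffer with a std::uint8_t mask flattens to std::tuple<double, std::uint8_t>;
    // the 1-byte type wins, so 16 / 1 = 16 iterations are launched per work-item.
    static_assert(std::is_same_v<example_min_tuple_type<std::tuple<double, std::uint8_t>>::type,
                                 std::uint8_t>);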
Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 59 ++++++++++++------- include/oneapi/dpl/pstl/tuple_impl.h | 20 +++++++ include/oneapi/dpl/pstl/utils.h | 20 +++++++ 3 files changed, 77 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d5df7c06cf6..b1328204a15 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -252,13 +252,35 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name.. } }; -template +template struct __parallel_for_large_submitter; -template -struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>> -{ - template +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> +{ + static constexpr std::uint8_t __bytes_per_work_item = 16; + // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip + // iterator. + using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< + std::tuple...>>::type; + using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; + // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the + // flattened ranges. This allows us to launch enough work per item to saturate device memory. + static constexpr std::uint8_t __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + + // Once there is enough work to launch a group on each compute unit with our __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec) + { + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * __iters_per_work_item * __max_cu; + } + + template auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { @@ -270,25 +292,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // Limit the work-group size to 512 which has empirically yielded the best results. std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); - __work_group_size = std::min(__work_group_size, static_cast(__count)); - using _ValueType = - oneapi::dpl::__internal::__value_t>>>; - - // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum - // number of bytes for a single input range to saturate HW bandwidth on target architecures. 
- constexpr std::uint8_t __bytes_per_work_item = 16; // TODO: Better handle this heuristic for the case where the input is a zip iterator - constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::size_t __num_groups = - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + auto [__idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate using a vectorized approach similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy). @@ -331,17 +344,19 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using _ForKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single // kernel that worsen performance for small cases. - if (__count <= 262144) + if (__count < __large_submitter::__estimate_best_start_size(__exec)) { - return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } else { - return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } } diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 239734d4861..0c528b0d15e 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,6 +793,26 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; + +// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. 
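+// For example, std::tuple<int, oneapi::dpl::__internal::tuple<char, double>, float> flattens to
+// std::tuple<int, char, double, float>.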
+template +struct __flatten_std_or_internal_tuple +{ + using type = std::tuple<_T>; +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 8a8dfdae1bc..1168dc76586 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,6 +784,26 @@ union __lazy_ctor_storage } }; +// Utility that returns the smallest type in tuple. +template +class __min_tuple_type; + +template +class __min_tuple_type> +{ + public: + using type = _T; +}; + +template +class __min_tuple_type> +{ + using __min_type_ts = typename __min_tuple_type>::type; + + public: + using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; +}; + } // namespace __internal } // namespace dpl } // namespace oneapi From d2cf632c12ffb28d499b17f2ea334578fc92caa3 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 16:00:45 -0500 Subject: [PATCH 11/26] Code cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 ++++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 16 +++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index b1328204a15..d768700cca0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -258,24 +258,29 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> { - static constexpr std::uint8_t __bytes_per_work_item = 16; // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the - // flattened ranges. This allows us to launch enough work per item to saturate device memory. + // flattened ranges. This allows us to launch enough work per item to saturate the device's memory + // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to + // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the + // best general performance. + static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; - // Once there is enough work to launch a group on each compute unit with our __iters_per_item, + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. 
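+    // For example (illustrative device numbers only): a 512 work-item group, 4 iterations per
+    // work-item (4-byte elements), and 64 compute units put the cutover at roughly
+    // 512 * 4 * 64 = 131072 elements; below this, the small-kernel submitter is used instead.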
template static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -290,10 +295,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - // Limit the work-group size to 512 which has empirically yielded the best results. - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); - - // TODO: Better handle this heuristic for the case where the input is a zip iterator + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -364,7 +366,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2072,7 +2074,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 14bb5122e04..2d39026d408 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -841,7 +841,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // Other compilation targets perform best with a work-group size stride. 
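// Example with illustrative numbers: for __iters_per_work_item == 4 and a 32-wide sub-group on a
// SPIR-V target, work-item 0 of the first sub-group touches indices 0, 32, 64, 96 and work-item 1
// touches 1, 33, 65, 97; on other targets the same pattern is applied at work-group granularity,
// i.e. the stride is the work-group size.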
template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { @@ -851,19 +852,18 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_group_start_idx = + __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - const bool __is_full_work_group = - __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From 4eeaf977fd50200971cd7fdb390cf45a1b95b647 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 13:29:09 -0500 Subject: [PATCH 12/26] Clang format Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 ++- include/oneapi/dpl/pstl/tuple_impl.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d768700cca0..bbcb496b36d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -295,7 +295,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
//get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 0c528b0d15e..c758a4a3f1b 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,12 +793,11 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; - // Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. template struct __flatten_std_or_internal_tuple { - using type = std::tuple<_T>; + using type = std::tuple<_T>; }; template From b717ac721cbe133bfee585cca2962f33e5fd5efb Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:02:21 -0500 Subject: [PATCH 13/26] Update comments Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index bbcb496b36d..c35ec4c54a6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -266,8 +266,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the - // best general performance. + // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); @@ -305,9 +305,10 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. [=](sycl::nd_item __ndi) { auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); - // TODO: Investigate using a vectorized approach similar to reduce. + // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but - // performance regressions for out-of-place (e.g. std::copy). + // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to + // vectorize our code. 
if (__is_full) { _ONEDPL_PRAGMA_UNROLL From 839f1ad5a1da289554099054609fce8895c58e17 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:05:09 -0500 Subject: [PATCH 14/26] Bugfix in comment Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c35ec4c54a6..b443a0e4836 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -266,7 +266,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = From b347606996d8e6b63645d197b80d75c82ab8feb7 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 15:10:06 -0500 Subject: [PATCH 15/26] More cleanup and better handle non-full case Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 31 ++++++++++++------- .../dpcpp/parallel_backend_sycl_utils.h | 6 ++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index b443a0e4836..2f7853659ee 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -280,7 +280,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -294,8 +295,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - std::size_t __work_group_size = + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); @@ -303,12 +303,13 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
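            // __num_items is rounded up to a whole number of work-groups, since an nd_range launch
            // requires the global size to be a multiple of the work-group size; surplus work-items
            // fall into the partial (non-full) path inside the kernel.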
__cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to - // vectorize our code. + // vectorize our code. Vectorization may also improve performance of for-algorithms over small data + // types. + auto [__idx, __group_start_idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL @@ -320,13 +321,19 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. } else { - for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) + // Recompute iters per item and manually unroll last loop iteration to remove most branching. + if (__group_start_idx >= __count) + return; + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) { - if (__idx < __count) - { - __brick(__idx, __rngs...); - __idx += __stride; - } + __brick(__idx, __rngs...); + __idx += __stride; + } + if (__idx < __count) + { + __brick(__idx, __rngs...); } } }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 2d39026d408..d5ad340fff4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
template -std::tuple +std::tuple __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } } From 72a09419f47a393aca9665bbc510cf33cd9a41b7 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:38:40 -0500 Subject: [PATCH 16/26] Rename __ndi to __item for consistency with codebase Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2f7853659ee..80c9649340c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -302,14 +302,14 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), - [=](sycl::nd_item __ndi) { + [=](sycl::nd_item __item) { // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. 
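// In the tuple unpacked just below, __idx is the first element index owned by this work-item,
// __group_start_idx is the first index owned by its sub-group (work-group on non-SPIR-V targets),
// __stride is the distance between consecutive elements processed by one work-item, and __is_full
// says whether every work-item in the group can run all __iters_per_work_item iterations without a bounds check.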
auto [__idx, __group_start_idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index d5ad340fff4..7205fa7fcf4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,18 +839,18 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template +template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); @@ -861,8 +861,8 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite else { const std::size_t __work_group_start_idx = - __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } From 9f82b11e41314ec4407d735083e670926ec7ea02 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:45:52 -0500 Subject: [PATCH 17/26] Update all comments on kernel naming trick Signed-off-by: Matthew Michel --- .../experimental/kt/internal/esimd_radix_sort_submitters.h | 2 +- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ 
b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template struct __parallel_scan_submitter; @@ -2083,7 +2083,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..3be82fdc623 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..280bb5181bd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -129,7 +129,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index edad63d2a79..23e38268bf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
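Before the next patch reworks the remainder handling, a minimal plain-C++ sketch of the access pattern involved (names such as process_work_item, iters_per_work_item and stride are illustrative stand-ins, not oneDPL or SYCL APIs): each group of work-items owns a contiguous block of iters_per_work_item * stride elements, a work-item starts at its local offset inside that block and advances by the stride, and only the last, partially filled block takes the guarded path that the following patch then simplifies.

#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t iters_per_work_item = 4; // mirrors the 16-byte / sizeof(int) heuristic

// One "work-item": start at the block offset plus the item's local id and walk
// forward by `stride`, taking the guarded path only when the block is not full.
void process_work_item(std::vector<int>& data, std::size_t block_start, std::size_t local_id, std::size_t stride)
{
    const std::size_t count = data.size();
    const bool is_full = block_start + iters_per_work_item * stride <= count;
    std::size_t idx = block_start + local_id;
    if (is_full)
    {
        for (std::size_t i = 0; i < iters_per_work_item; ++i, idx += stride)
            ++data[idx]; // the "brick"
    }
    else
    {
        for (std::size_t i = 0; i < iters_per_work_item; ++i, idx += stride)
            if (idx < count)
                ++data[idx]; // guarded remainder path
    }
}

int main()
{
    const std::size_t stride = 8;  // stands in for a sub-group size
    std::vector<int> data(100, 0); // 100 is deliberately not a multiple of 8 * 4
    for (std::size_t block_start = 0; block_start < data.size(); block_start += iters_per_work_item * stride)
        for (std::size_t local_id = 0; local_id < stride; ++local_id)
            process_work_item(data, block_start, local_id, stride);
    for (int v : data)
        if (v != 1)
        {
            std::puts("some element was missed or touched twice");
            return 1;
        }
    std::puts("all elements processed exactly once");
    return 0;
}

Running this prints "all elements processed exactly once", confirming the strided pattern covers each index exactly once even when the element count is not a multiple of the block size.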
From b5f00211d6b0533afc808a51bf94d19ede951558 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 09:29:54 -0500 Subject: [PATCH 18/26] Handle non-full case in a cleaner way Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 16 +++++----------- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 6 +++--- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8fb6fd8dfb1..4691d889cc9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -308,7 +308,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. - auto [__idx, __group_start_idx, __stride, __is_full] = + auto [__idx, __stride, __is_full] = __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { @@ -319,22 +319,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. __idx += __stride; } } - else + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) { - // Recompute iters per item and manually unroll last loop iteration to remove most branching. - if (__group_start_idx >= __count) - return; const std::uint8_t __adjusted_iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); - for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; } - if (__idx < __count) - { - __brick(__idx, __rngs...); - } } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 7205fa7fcf4..ea486f14504 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
template -std::tuple +std::tuple __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From 5519dacf48c89adf7fcf4f7c3ed7ce99a290d446 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:17:14 -0500 Subject: [PATCH 19/26] Switch min tuple type utility to return size of type Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++-- include/oneapi/dpl/pstl/utils.h | 17 ++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4691d889cc9..db8e5bac23f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -262,7 +262,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; - using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; + static constexpr std::size_t __min_type_size = + oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to @@ -270,7 +271,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size); // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 1168dc76586..10d60d8c5d6 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,26 +784,29 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type in tuple. +// Utility that returns the smallest type size in a tuple. template -class __min_tuple_type; +class __min_tuple_type_size; template -class __min_tuple_type> +class __min_tuple_type_size> { public: - using type = _T; + static constexpr std::size_t value = sizeof(_T); }; template -class __min_tuple_type> +class __min_tuple_type_size> { - using __min_type_ts = typename __min_tuple_type>::type; + static constexpr std::size_t __min_type_value_ts = __min_tuple_type_size>::value; public: - using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; + static constexpr std::size_t value = std::min(sizeof(_T), __min_type_value_ts); }; +template +inline constexpr std::size_t __min_tuple_type_size_v = __min_tuple_type_size<_Tuple>::value; + } // namespace __internal } // namespace dpl } // namespace oneapi From 2d985a51229381ebd487310608364a4eb7796bae Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:59:53 -0500 Subject: [PATCH 20/26] Remove unnecessary template parameter Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index ea486f14504..322678623e8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,9 +839,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template std::tuple -__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) From 6ab46adddb6896049af983a6f6b5a6bd91100b67 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 13:07:30 -0500 Subject: [PATCH 21/26] Make non-template function inline for ODR compliance Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 322678623e8..a3ce6390d39 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,7 +839,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
-std::tuple +inline std::tuple __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { From e3e05a7567a404c93d0463ac39f2c44f9e2b4a6e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 14:21:40 -0700 Subject: [PATCH 22/26] If the iters per work item is 1, then only compile the basic pfor kernel Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index db8e5bac23f..e55edf54996 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -352,16 +352,25 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; - // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single - // kernel that worsen performance for small cases. - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. + if constexpr (__large_submitter::__iters_per_work_item > 1) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + if (__count < __large_submitter::__estimate_best_start_size(__exec)) + { + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } else { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } } From 8f9c5bb312dfbef15a2a4cff1fa2ebcb56d6ac32 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:11:44 -0500 Subject: [PATCH 23/26] Address several PR comments * Move __stride_recommender into __parallel_for_large_submitter * Use {} to invoke constructor * Simplify if-else statements in for dispatch Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e55edf54996..85139bd92a7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -275,6 +275,39 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. + static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) + { + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } + } + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. template @@ -357,22 +390,14 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // then only compile the basic kernel as the two versions are effectively the same. 
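// Note: __iters_per_work_item == ceil(16 / __min_type_size), so it equals 1 exactly when the smallest
// value type across the flattened ranges is at least 16 bytes; for int it is 4 and for char it is 16.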
if constexpr (__large_submitter::__iters_per_work_item > 1) { - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + if (__count >= __large_submitter::__estimate_best_start_size(__exec)) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } - else - { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); - } - } - else - { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } //------------------------------------------------------------------------ From 55623b23aab00449ef2b248e5a6659862dea7c7d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:19:31 -0500 Subject: [PATCH 24/26] Remove free function __stride_recommender Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_utils.h | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index a3ce6390d39..e7fbfb7ae7c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,39 +834,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; -// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different -// devices. This utility can only be called from the device. -// -// SPIR-V compilation targets show best performance with a stride of the sub-group size. -// Other compilation targets perform best with a work-group size stride. 
-inline std::tuple -__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, - std::size_t __work_group_size) -{ - if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) - { - const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); - const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); - - const std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); - } - else - { - const std::size_t __work_group_start_idx = - __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); - const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); - } -} - } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 39b572f6bb556d0b274655c790fbe8bb6a3206a0 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 13:17:30 -0500 Subject: [PATCH 25/26] Accept ranges as forwarding references in __parallel_for_large_submitter Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 85139bd92a7..63148e0be63 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -252,16 +252,16 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name.. } }; -template +template struct __parallel_for_large_submitter; -template -struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; + std::tuple...>>::type; static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the @@ -320,7 +320,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. return __work_group_size * __iters_per_work_item * __max_cu; } - template + template auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) const { From 33337f863289c12c4e31e04c37712774d165cb0a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 6 Nov 2024 13:44:38 -0600 Subject: [PATCH 26/26] Address reviewer comments Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++----- include/oneapi/dpl/pstl/tuple_impl.h | 19 -------------- include/oneapi/dpl/pstl/utils.h | 25 +++++++------------ 3 files changed, 13 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 63148e0be63..41cc600dc0d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,10 +28,12 @@ #include #include #include +#include #include "../../iterator_impl.h" #include "../../execution_impl.h" #include "../../utils_ranges.h" +#include "../../utils.h" #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -258,12 +260,8 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { - // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip - // iterator. - using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; - static constexpr std::size_t __min_type_size = - oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; + static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size< + std::tuple...>>::value; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index c758a4a3f1b..239734d4861 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,25 +793,6 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; -// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. -template -struct __flatten_std_or_internal_tuple -{ - using type = std::tuple<_T>; -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 10d60d8c5d6..1848d33eaea 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,29 +785,21 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type size in a tuple. -template -class __min_tuple_type_size; - +// Returns the smallest type within a set of potentially nested template types. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). 
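// With the template arguments written out, an illustrative use (the member types are arbitrary examples):
//     static_assert(__min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value == sizeof(short));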
template <typename _T> -class __min_tuple_type_size<std::tuple<_T>> +struct __min_nested_type_size { - public: - static constexpr std::size_t value = sizeof(_T); + constexpr static std::size_t value = sizeof(_T); }; -template <typename _T, typename... _Ts> -class __min_tuple_type_size<std::tuple<_T, _Ts...>> +template