diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index e55edf54996..85139bd92a7 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -275,6 +275,39 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
     // Limit the work-group size to 512 which has empirically yielded the best results across different architectures.
     static constexpr std::uint16_t __max_work_group_size = 512;
 
+    // SPIR-V compilation targets show best performance with a stride of the sub-group size.
+    // Other compilation targets perform best with a work-group size stride. This utility can only be called from the
+    // device.
+    static inline std::tuple<std::size_t, std::size_t, bool>
+    __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item,
+                         std::size_t __work_group_size)
+    {
+        if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
+        {
+            const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group();
+            const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
+            const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
+            const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
+            const std::size_t __work_group_id = __item.get_group().get_group_linear_id();
+
+            const std::size_t __sub_group_start_idx =
+                __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
+            const bool __is_full_sub_group =
+                __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
+            const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
+            return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group);
+        }
+        else
+        {
+            const std::size_t __work_group_start_idx =
+                __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
+            const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id();
+            const bool __is_full_work_group =
+                __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
+            return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group);
+        }
+    }
+
     // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item,
     // then we should start using this code path.
     template <typename _ExecutionPolicy>
@@ -357,22 +390,14 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
     // then only compile the basic kernel as the two versions are effectively the same.
     if constexpr (__large_submitter::__iters_per_work_item > 1)
     {
-        if (__count < __large_submitter::__estimate_best_start_size(__exec))
+        if (__count >= __large_submitter::__estimate_best_start_size(__exec))
         {
-            return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+            return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
                                        std::forward<_Ranges>(__rngs)...);
         }
-        else
-        {
-            return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                       std::forward<_Ranges>(__rngs)...);
-        }
-    }
-    else
-    {
-        return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                   std::forward<_Ranges>(__rngs)...);
     }
+    return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                               std::forward<_Ranges>(__rngs)...);
 }
 
 //------------------------------------------------------------------------
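
Note (not part of the diff): below is a minimal sketch of how a kernel body might consume the (start index, stride, full-group flag) triple returned by __stride_recommender. The names __item, __count, __iters_per_work_item, __work_group_size, __brick, and __rngs are assumed to come from the surrounding kernel scope, and the __brick(index, __rngs...) call form is illustrative only; the actual invocation used by the large submitter is not shown in this hunk.

    // Illustrative consumer of __stride_recommender (assumed kernel-scope names).
    const auto [__idx, __stride, __is_full] =
        __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size);
    if (__is_full)
    {
        // Full group: every strided access is in bounds, so no per-element bounds check is needed.
        for (std::size_t __i = 0; __i < __iters_per_work_item; ++__i)
            __brick(__idx + __i * __stride, __rngs...);
    }
    else
    {
        // Partial group at the tail of the input: guard each strided access against __count.
        for (std::size_t __i = 0; __i < __iters_per_work_item; ++__i)
        {
            const std::size_t __elem = __idx + __i * __stride;
            if (__elem < __count)
                __brick(__elem, __rngs...);
        }
    }

With this pattern, neighboring work-items touch adjacent elements on each iteration (sub-group stride on SPIR-V targets, work-group stride elsewhere), which is typically what makes the recommended strides perform well.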