From 8b9c3c99989e51618476d620b17395d016b2c426 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 14:21:40 -0700 Subject: [PATCH] If the iters per work item is 1, then only compile the basic pfor kernel Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5fe0fab7f4..af3ff4eecf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -352,16 +352,25 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; - // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single - // kernel that worsen performance for small cases. - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the large submitter processes only one iteration per + // work item, then only compile the small (basic) kernel, as the two versions are effectively the same. 
+ if constexpr (__large_submitter::__iters_per_work_item > 1) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + if (__count < __large_submitter::__estimate_best_start_size(__exec)) + { + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } else { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } }